From 866c63d90c75df4b073492963c73ada40fa5e6b7 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Mon, 1 Jun 2026 02:31:19 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: Neelectric/Llama-3.1-8B-Instruct_SFT_sciencev00.07 Source: Original Platform --- .gitattributes | 36 + README.md | 59 + all_results.json | 8 + chat_template.jinja | 121 + config.json | 35 + generation_config.json | 8 + model-00001-of-00004.safetensors | 3 + model-00002-of-00004.safetensors | 3 + model-00003-of-00004.safetensors | 3 + model-00004-of-00004.safetensors | 3 + model.safetensors.index.json | 299 + special_tokens_map.json | 10 + tokenizer.json | 3 + tokenizer_config.json | 2062 + train_results.json | 8 + trainer_state.json | 163951 ++++++++++++++++++++++++++++ training_args.bin | 3 + 17 files changed, 166615 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 all_results.json create mode 100644 chat_template.jinja create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model-00001-of-00004.safetensors create mode 100644 model-00002-of-00004.safetensors create mode 100644 model-00003-of-00004.safetensors create mode 100644 model-00004-of-00004.safetensors create mode 100644 model.safetensors.index.json create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 train_results.json create mode 100644 trainer_state.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs 
diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..f54dcd6 --- /dev/null +++ b/README.md @@ -0,0 +1,59 @@ +--- +base_model: meta-llama/Llama-3.1-8B-Instruct +datasets: Neelectric/Replay_0.03.MoT_science.wildguardmix_reasoning.Llama3_4096toks +library_name: transformers +model_name: Llama-3.1-8B-Instruct_SFT_sciencev00.07 +tags: +- generated_from_trainer +- sft +- open-r1 +- trl +license: llama3.1 +--- + +# Model Card for Llama-3.1-8B-Instruct_SFT_sciencev00.07 + +This model 
is a fine-tuned version of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) on the [Neelectric/Replay_0.03.MoT_science.wildguardmix_reasoning.Llama3_4096toks](https://huggingface.co/datasets/Neelectric/Replay_0.03.MoT_science.wildguardmix_reasoning.Llama3_4096toks) dataset. +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="Neelectric/Llama-3.1-8B-Instruct_SFT_sciencev00.07", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/neelectric/open-r1_science/runs/aya2tyqz) + + +This model was trained with SFT. 
+ +### Framework versions + +- TRL: 0.28.0.dev0 +- Transformers: 4.57.6 +- Pytorch: 2.9.0 +- Datasets: 4.5.0 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..bea5e83 --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 2.0766464617520038e+19, + "train_loss": 0.9501410755198944, + "train_runtime": 21390.3581, + "train_samples": 145693, + "train_samples_per_second": 13.622, + "train_steps_per_second": 0.851 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..0ab931a --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,121 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think> +... 
+</think> +<answer> +... +</answer>" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {%- if message['role'] == 'assistant' %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} + {% generation %} + {{- message['content'] | trim + '<|eot_id|>' }} + {% endgeneration %} + {%- else %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- endif %} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {% generation %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {% endgeneration %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} 
+{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..e1d9068 --- /dev/null +++ b/config.json @@ -0,0 +1,35 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "transformers_version": "4.57.6", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..1996dc1 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,8 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.57.6" +} diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..60516fa --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:baf32706ca9b4044763075cca315eeb1aa20f8bddaee9bb1a832a1baa38c58bb +size 4976698672 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..347ce13 --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1649e82a976653967706b7b9bc7b9066b070f7935325cba01f3f0b52437a5af +size 4999802720 diff --git 
a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..2a27edb --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ee9a00b27cf2601e38443dae2c2ca7cb36250b7aafe4414e50cad85991c7af5 +size 4915916176 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..15e68f9 --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22019e1f6d5a842d6178fe29b0aa97899c5a0351889ef93157dadaa06bb75fa5 +size 1168138808 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..5c64f1e --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,299 @@ +{ + "metadata": { + "total_parameters": 8030261248, + "total_size": 16060522496 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": 
"model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + 
"model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..e8f05fa --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,10 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..8b0c7c1 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2062 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": 
"<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": 
"<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": 
"<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": 
"<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..bea5e83 --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 2.0766464617520038e+19, + "train_loss": 0.9501410755198944, + "train_runtime": 21390.3581, + "train_samples": 145693, + "train_samples_per_second": 13.622, + "train_steps_per_second": 0.851 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..e2849cf --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,163951 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 18212, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00010981770261366133, + "grad_norm": 7.30096960067749, + "learning_rate": 0.0, + "loss": 1.4841, + "mean_token_accuracy": 0.6132287979125977, + "num_tokens": 25099.0, + "step": 1 + }, + { + "epoch": 0.00021963540522732265, + "grad_norm": 6.588904857635498, + "learning_rate": 5.48847420417124e-10, + "loss": 1.4857, + 
"mean_token_accuracy": 0.6123285293579102, + "num_tokens": 54691.0, + "step": 2 + }, + { + "epoch": 0.000329453107840984, + "grad_norm": 7.019151210784912, + "learning_rate": 1.097694840834248e-09, + "loss": 1.4791, + "mean_token_accuracy": 0.6070127487182617, + "num_tokens": 79603.0, + "step": 3 + }, + { + "epoch": 0.0004392708104546453, + "grad_norm": 7.5902099609375, + "learning_rate": 1.646542261251372e-09, + "loss": 1.56, + "mean_token_accuracy": 0.6013427972793579, + "num_tokens": 104718.0, + "step": 4 + }, + { + "epoch": 0.0005490885130683066, + "grad_norm": 7.7578840255737305, + "learning_rate": 2.195389681668496e-09, + "loss": 1.5329, + "mean_token_accuracy": 0.614738941192627, + "num_tokens": 126592.0, + "step": 5 + }, + { + "epoch": 0.000658906215681968, + "grad_norm": 7.769545555114746, + "learning_rate": 2.7442371020856205e-09, + "loss": 1.4751, + "mean_token_accuracy": 0.6179271340370178, + "num_tokens": 149116.0, + "step": 6 + }, + { + "epoch": 0.0007687239182956292, + "grad_norm": 6.9148969650268555, + "learning_rate": 3.293084522502744e-09, + "loss": 1.4527, + "mean_token_accuracy": 0.6184134483337402, + "num_tokens": 175566.0, + "step": 7 + }, + { + "epoch": 0.0008785416209092906, + "grad_norm": 7.937089920043945, + "learning_rate": 3.841931942919868e-09, + "loss": 1.5506, + "mean_token_accuracy": 0.5958436727523804, + "num_tokens": 198562.0, + "step": 8 + }, + { + "epoch": 0.000988359323522952, + "grad_norm": 6.677840709686279, + "learning_rate": 4.390779363336992e-09, + "loss": 1.3833, + "mean_token_accuracy": 0.6345700621604919, + "num_tokens": 225755.0, + "step": 9 + }, + { + "epoch": 0.0010981770261366132, + "grad_norm": 8.31437873840332, + "learning_rate": 4.939626783754116e-09, + "loss": 1.6379, + "mean_token_accuracy": 0.5840341448783875, + "num_tokens": 247850.0, + "step": 10 + }, + { + "epoch": 0.0012079947287502745, + "grad_norm": 6.185459613800049, + "learning_rate": 5.488474204171241e-09, + "loss": 1.5231, + "mean_token_accuracy": 
0.606008768081665, + "num_tokens": 283593.0, + "step": 11 + }, + { + "epoch": 0.001317812431363936, + "grad_norm": 7.3887152671813965, + "learning_rate": 6.0373216245883644e-09, + "loss": 1.5305, + "mean_token_accuracy": 0.6092687249183655, + "num_tokens": 307076.0, + "step": 12 + }, + { + "epoch": 0.0014276301339775973, + "grad_norm": 6.815072059631348, + "learning_rate": 6.586169045005488e-09, + "loss": 1.4923, + "mean_token_accuracy": 0.6139775514602661, + "num_tokens": 334008.0, + "step": 13 + }, + { + "epoch": 0.0015374478365912585, + "grad_norm": 7.32658052444458, + "learning_rate": 7.135016465422612e-09, + "loss": 1.5262, + "mean_token_accuracy": 0.6075522899627686, + "num_tokens": 359610.0, + "step": 14 + }, + { + "epoch": 0.0016472655392049198, + "grad_norm": 6.286005973815918, + "learning_rate": 7.683863885839736e-09, + "loss": 1.5065, + "mean_token_accuracy": 0.6064472198486328, + "num_tokens": 394686.0, + "step": 15 + }, + { + "epoch": 0.0017570832418185812, + "grad_norm": 7.518366813659668, + "learning_rate": 8.23271130625686e-09, + "loss": 1.4102, + "mean_token_accuracy": 0.6233112812042236, + "num_tokens": 418738.0, + "step": 16 + }, + { + "epoch": 0.0018669009444322424, + "grad_norm": 7.024853706359863, + "learning_rate": 8.781558726673984e-09, + "loss": 1.4587, + "mean_token_accuracy": 0.6144721508026123, + "num_tokens": 443811.0, + "step": 17 + }, + { + "epoch": 0.001976718647045904, + "grad_norm": 8.676950454711914, + "learning_rate": 9.330406147091108e-09, + "loss": 1.5179, + "mean_token_accuracy": 0.6100056767463684, + "num_tokens": 463946.0, + "step": 18 + }, + { + "epoch": 0.002086536349659565, + "grad_norm": 7.51622200012207, + "learning_rate": 9.879253567508231e-09, + "loss": 1.4568, + "mean_token_accuracy": 0.6115257740020752, + "num_tokens": 485721.0, + "step": 19 + }, + { + "epoch": 0.0021963540522732263, + "grad_norm": 7.987913131713867, + "learning_rate": 1.0428100987925357e-08, + "loss": 1.5382, + "mean_token_accuracy": 
0.6070640087127686, + "num_tokens": 508528.0, + "step": 20 + }, + { + "epoch": 0.002306171754886888, + "grad_norm": 7.707884788513184, + "learning_rate": 1.0976948408342482e-08, + "loss": 1.586, + "mean_token_accuracy": 0.594460129737854, + "num_tokens": 531846.0, + "step": 21 + }, + { + "epoch": 0.002415989457500549, + "grad_norm": 7.018512725830078, + "learning_rate": 1.1525795828759604e-08, + "loss": 1.4027, + "mean_token_accuracy": 0.6366407871246338, + "num_tokens": 556969.0, + "step": 22 + }, + { + "epoch": 0.0025258071601142102, + "grad_norm": 6.632835388183594, + "learning_rate": 1.2074643249176729e-08, + "loss": 1.5046, + "mean_token_accuracy": 0.6117215156555176, + "num_tokens": 585584.0, + "step": 23 + }, + { + "epoch": 0.002635624862727872, + "grad_norm": 7.110255241394043, + "learning_rate": 1.2623490669593852e-08, + "loss": 1.5104, + "mean_token_accuracy": 0.6058076024055481, + "num_tokens": 611327.0, + "step": 24 + }, + { + "epoch": 0.002745442565341533, + "grad_norm": 6.253136157989502, + "learning_rate": 1.3172338090010976e-08, + "loss": 1.3434, + "mean_token_accuracy": 0.6475535035133362, + "num_tokens": 640555.0, + "step": 25 + }, + { + "epoch": 0.0028552602679551946, + "grad_norm": 6.2889814376831055, + "learning_rate": 1.3721185510428101e-08, + "loss": 1.5174, + "mean_token_accuracy": 0.6018496751785278, + "num_tokens": 673137.0, + "step": 26 + }, + { + "epoch": 0.0029650779705688557, + "grad_norm": 6.260401248931885, + "learning_rate": 1.4270032930845225e-08, + "loss": 1.5201, + "mean_token_accuracy": 0.6013709306716919, + "num_tokens": 704633.0, + "step": 27 + }, + { + "epoch": 0.003074895673182517, + "grad_norm": 7.015823841094971, + "learning_rate": 1.4818880351262348e-08, + "loss": 1.501, + "mean_token_accuracy": 0.6116892099380493, + "num_tokens": 730399.0, + "step": 28 + }, + { + "epoch": 0.0031847133757961785, + "grad_norm": 7.725862503051758, + "learning_rate": 1.536772777167947e-08, + "loss": 1.5122, + "mean_token_accuracy": 
0.608163595199585, + "num_tokens": 749148.0, + "step": 29 + }, + { + "epoch": 0.0032945310784098397, + "grad_norm": 7.384652614593506, + "learning_rate": 1.5916575192096597e-08, + "loss": 1.5085, + "mean_token_accuracy": 0.6175290942192078, + "num_tokens": 772676.0, + "step": 30 + }, + { + "epoch": 0.003404348781023501, + "grad_norm": 6.971219539642334, + "learning_rate": 1.646542261251372e-08, + "loss": 1.5109, + "mean_token_accuracy": 0.6111197471618652, + "num_tokens": 800694.0, + "step": 31 + }, + { + "epoch": 0.0035141664836371624, + "grad_norm": 6.877848148345947, + "learning_rate": 1.7014270032930844e-08, + "loss": 1.498, + "mean_token_accuracy": 0.6083377003669739, + "num_tokens": 829694.0, + "step": 32 + }, + { + "epoch": 0.0036239841862508236, + "grad_norm": 7.352473258972168, + "learning_rate": 1.756311745334797e-08, + "loss": 1.4517, + "mean_token_accuracy": 0.6213776469230652, + "num_tokens": 854651.0, + "step": 33 + }, + { + "epoch": 0.0037338018888644848, + "grad_norm": 7.012495994567871, + "learning_rate": 1.811196487376509e-08, + "loss": 1.4439, + "mean_token_accuracy": 0.6199117302894592, + "num_tokens": 880665.0, + "step": 34 + }, + { + "epoch": 0.0038436195914781464, + "grad_norm": 6.412200927734375, + "learning_rate": 1.8660812294182216e-08, + "loss": 1.4573, + "mean_token_accuracy": 0.6160067915916443, + "num_tokens": 910985.0, + "step": 35 + }, + { + "epoch": 0.003953437294091808, + "grad_norm": 6.179067611694336, + "learning_rate": 1.920965971459934e-08, + "loss": 1.4302, + "mean_token_accuracy": 0.6316869258880615, + "num_tokens": 940555.0, + "step": 36 + }, + { + "epoch": 0.004063254996705469, + "grad_norm": 6.833353519439697, + "learning_rate": 1.9758507135016463e-08, + "loss": 1.5077, + "mean_token_accuracy": 0.6085662841796875, + "num_tokens": 968629.0, + "step": 37 + }, + { + "epoch": 0.00417307269931913, + "grad_norm": 8.323586463928223, + "learning_rate": 2.030735455543359e-08, + "loss": 1.5401, + "mean_token_accuracy": 
0.5996390581130981, + "num_tokens": 989856.0, + "step": 38 + }, + { + "epoch": 0.004282890401932792, + "grad_norm": 8.070658683776855, + "learning_rate": 2.0856201975850713e-08, + "loss": 1.5025, + "mean_token_accuracy": 0.6162833571434021, + "num_tokens": 1011899.0, + "step": 39 + }, + { + "epoch": 0.004392708104546453, + "grad_norm": 7.304897308349609, + "learning_rate": 2.1405049396267835e-08, + "loss": 1.4835, + "mean_token_accuracy": 0.613460898399353, + "num_tokens": 1036113.0, + "step": 40 + }, + { + "epoch": 0.004502525807160114, + "grad_norm": 6.455310821533203, + "learning_rate": 2.1953896816684964e-08, + "loss": 1.5876, + "mean_token_accuracy": 0.5887376070022583, + "num_tokens": 1068313.0, + "step": 41 + }, + { + "epoch": 0.004612343509773776, + "grad_norm": 7.99082612991333, + "learning_rate": 2.2502744237102085e-08, + "loss": 1.4995, + "mean_token_accuracy": 0.6211690306663513, + "num_tokens": 1090214.0, + "step": 42 + }, + { + "epoch": 0.0047221612123874365, + "grad_norm": 7.2630486488342285, + "learning_rate": 2.3051591657519207e-08, + "loss": 1.3967, + "mean_token_accuracy": 0.6404081583023071, + "num_tokens": 1114488.0, + "step": 43 + }, + { + "epoch": 0.004831978915001098, + "grad_norm": 7.910749435424805, + "learning_rate": 2.3600439077936336e-08, + "loss": 1.4853, + "mean_token_accuracy": 0.6181104183197021, + "num_tokens": 1135710.0, + "step": 44 + }, + { + "epoch": 0.00494179661761476, + "grad_norm": 8.323529243469238, + "learning_rate": 2.4149286498353458e-08, + "loss": 1.5886, + "mean_token_accuracy": 0.5910241603851318, + "num_tokens": 1155858.0, + "step": 45 + }, + { + "epoch": 0.0050516143202284204, + "grad_norm": 7.020382881164551, + "learning_rate": 2.469813391877058e-08, + "loss": 1.4906, + "mean_token_accuracy": 0.6072943806648254, + "num_tokens": 1183077.0, + "step": 46 + }, + { + "epoch": 0.005161432022842082, + "grad_norm": 7.525810718536377, + "learning_rate": 2.5246981339187705e-08, + "loss": 1.3741, + "mean_token_accuracy": 
0.6378620266914368, + "num_tokens": 1205251.0, + "step": 47 + }, + { + "epoch": 0.005271249725455744, + "grad_norm": 7.046121597290039, + "learning_rate": 2.579582875960483e-08, + "loss": 1.5164, + "mean_token_accuracy": 0.603619396686554, + "num_tokens": 1233061.0, + "step": 48 + }, + { + "epoch": 0.005381067428069404, + "grad_norm": 6.286274433135986, + "learning_rate": 2.6344676180021952e-08, + "loss": 1.4035, + "mean_token_accuracy": 0.6290103197097778, + "num_tokens": 1263257.0, + "step": 49 + }, + { + "epoch": 0.005490885130683066, + "grad_norm": 6.973885536193848, + "learning_rate": 2.6893523600439077e-08, + "loss": 1.4182, + "mean_token_accuracy": 0.6311417818069458, + "num_tokens": 1290400.0, + "step": 50 + }, + { + "epoch": 0.0056007028332967276, + "grad_norm": 8.457390785217285, + "learning_rate": 2.7442371020856202e-08, + "loss": 1.4826, + "mean_token_accuracy": 0.6093466281890869, + "num_tokens": 1309792.0, + "step": 51 + }, + { + "epoch": 0.005710520535910389, + "grad_norm": 7.20143461227417, + "learning_rate": 2.7991218441273324e-08, + "loss": 1.4306, + "mean_token_accuracy": 0.6246258616447449, + "num_tokens": 1333227.0, + "step": 52 + }, + { + "epoch": 0.00582033823852405, + "grad_norm": 7.498056888580322, + "learning_rate": 2.854006586169045e-08, + "loss": 1.47, + "mean_token_accuracy": 0.6228734254837036, + "num_tokens": 1355812.0, + "step": 53 + }, + { + "epoch": 0.0059301559411377115, + "grad_norm": 7.396664142608643, + "learning_rate": 2.9088913282107574e-08, + "loss": 1.5111, + "mean_token_accuracy": 0.6081197261810303, + "num_tokens": 1380205.0, + "step": 54 + }, + { + "epoch": 0.006039973643751373, + "grad_norm": 6.474167346954346, + "learning_rate": 2.9637760702524696e-08, + "loss": 1.4634, + "mean_token_accuracy": 0.6158449649810791, + "num_tokens": 1411556.0, + "step": 55 + }, + { + "epoch": 0.006149791346365034, + "grad_norm": 6.003460884094238, + "learning_rate": 3.018660812294182e-08, + "loss": 1.5529, + "mean_token_accuracy": 
0.6004980802536011, + "num_tokens": 1446822.0, + "step": 56 + }, + { + "epoch": 0.006259609048978695, + "grad_norm": 7.868234157562256, + "learning_rate": 3.073545554335894e-08, + "loss": 1.5336, + "mean_token_accuracy": 0.5997437834739685, + "num_tokens": 1468712.0, + "step": 57 + }, + { + "epoch": 0.006369426751592357, + "grad_norm": 7.96655797958374, + "learning_rate": 3.128430296377607e-08, + "loss": 1.4883, + "mean_token_accuracy": 0.6132872700691223, + "num_tokens": 1490433.0, + "step": 58 + }, + { + "epoch": 0.006479244454206018, + "grad_norm": 7.5497050285339355, + "learning_rate": 3.1833150384193193e-08, + "loss": 1.4893, + "mean_token_accuracy": 0.6169414520263672, + "num_tokens": 1514832.0, + "step": 59 + }, + { + "epoch": 0.006589062156819679, + "grad_norm": 6.5545220375061035, + "learning_rate": 3.2381997804610315e-08, + "loss": 1.4587, + "mean_token_accuracy": 0.621928334236145, + "num_tokens": 1543956.0, + "step": 60 + }, + { + "epoch": 0.006698879859433341, + "grad_norm": 6.858253479003906, + "learning_rate": 3.293084522502744e-08, + "loss": 1.488, + "mean_token_accuracy": 0.6071574091911316, + "num_tokens": 1571567.0, + "step": 61 + }, + { + "epoch": 0.006808697562047002, + "grad_norm": 6.897509574890137, + "learning_rate": 3.3479692645444566e-08, + "loss": 1.5111, + "mean_token_accuracy": 0.618494987487793, + "num_tokens": 1599309.0, + "step": 62 + }, + { + "epoch": 0.006918515264660663, + "grad_norm": 8.709039688110352, + "learning_rate": 3.402854006586169e-08, + "loss": 1.4631, + "mean_token_accuracy": 0.6240957975387573, + "num_tokens": 1618837.0, + "step": 63 + }, + { + "epoch": 0.007028332967274325, + "grad_norm": 7.848440170288086, + "learning_rate": 3.4577387486278816e-08, + "loss": 1.5112, + "mean_token_accuracy": 0.6118676066398621, + "num_tokens": 1640074.0, + "step": 64 + }, + { + "epoch": 0.007138150669887986, + "grad_norm": 7.51212739944458, + "learning_rate": 3.512623490669594e-08, + "loss": 1.5879, + "mean_token_accuracy": 
0.6031206846237183, + "num_tokens": 1664200.0, + "step": 65 + }, + { + "epoch": 0.007247968372501647, + "grad_norm": 6.773792743682861, + "learning_rate": 3.567508232711306e-08, + "loss": 1.4525, + "mean_token_accuracy": 0.6139948964118958, + "num_tokens": 1687747.0, + "step": 66 + }, + { + "epoch": 0.007357786075115309, + "grad_norm": 6.73121976852417, + "learning_rate": 3.622392974753018e-08, + "loss": 1.4926, + "mean_token_accuracy": 0.6108168959617615, + "num_tokens": 1713976.0, + "step": 67 + }, + { + "epoch": 0.0074676037777289695, + "grad_norm": 7.711812496185303, + "learning_rate": 3.677277716794731e-08, + "loss": 1.5555, + "mean_token_accuracy": 0.6020510792732239, + "num_tokens": 1737277.0, + "step": 68 + }, + { + "epoch": 0.007577421480342631, + "grad_norm": 6.840099334716797, + "learning_rate": 3.732162458836443e-08, + "loss": 1.5118, + "mean_token_accuracy": 0.6028223633766174, + "num_tokens": 1764592.0, + "step": 69 + }, + { + "epoch": 0.007687239182956293, + "grad_norm": 6.095495223999023, + "learning_rate": 3.787047200878156e-08, + "loss": 1.3888, + "mean_token_accuracy": 0.6353126764297485, + "num_tokens": 1796132.0, + "step": 70 + }, + { + "epoch": 0.007797056885569954, + "grad_norm": 7.043576717376709, + "learning_rate": 3.841931942919868e-08, + "loss": 1.5527, + "mean_token_accuracy": 0.6142816543579102, + "num_tokens": 1819398.0, + "step": 71 + }, + { + "epoch": 0.007906874588183616, + "grad_norm": 7.239400863647461, + "learning_rate": 3.8968166849615804e-08, + "loss": 1.5973, + "mean_token_accuracy": 0.590431272983551, + "num_tokens": 1845429.0, + "step": 72 + }, + { + "epoch": 0.008016692290797276, + "grad_norm": 7.2331743240356445, + "learning_rate": 3.9517014270032926e-08, + "loss": 1.5592, + "mean_token_accuracy": 0.5938911437988281, + "num_tokens": 1869736.0, + "step": 73 + }, + { + "epoch": 0.008126509993410937, + "grad_norm": 7.242239475250244, + "learning_rate": 4.006586169045005e-08, + "loss": 1.4355, + "mean_token_accuracy": 
0.6178789138793945, + "num_tokens": 1893333.0, + "step": 74 + }, + { + "epoch": 0.008236327696024599, + "grad_norm": 7.48700475692749, + "learning_rate": 4.061470911086718e-08, + "loss": 1.5374, + "mean_token_accuracy": 0.5994501709938049, + "num_tokens": 1916227.0, + "step": 75 + }, + { + "epoch": 0.00834614539863826, + "grad_norm": 6.7296977043151855, + "learning_rate": 4.1163556531284305e-08, + "loss": 1.4624, + "mean_token_accuracy": 0.6143727898597717, + "num_tokens": 1943995.0, + "step": 76 + }, + { + "epoch": 0.008455963101251922, + "grad_norm": 6.246706008911133, + "learning_rate": 4.1712403951701427e-08, + "loss": 1.4508, + "mean_token_accuracy": 0.6281577348709106, + "num_tokens": 1973005.0, + "step": 77 + }, + { + "epoch": 0.008565780803865584, + "grad_norm": 6.974816799163818, + "learning_rate": 4.226125137211855e-08, + "loss": 1.4917, + "mean_token_accuracy": 0.6136125922203064, + "num_tokens": 1996834.0, + "step": 78 + }, + { + "epoch": 0.008675598506479244, + "grad_norm": 6.40219783782959, + "learning_rate": 4.281009879253567e-08, + "loss": 1.5006, + "mean_token_accuracy": 0.6025249361991882, + "num_tokens": 2026909.0, + "step": 79 + }, + { + "epoch": 0.008785416209092905, + "grad_norm": 7.053531646728516, + "learning_rate": 4.335894621295279e-08, + "loss": 1.43, + "mean_token_accuracy": 0.626285970211029, + "num_tokens": 2050644.0, + "step": 80 + }, + { + "epoch": 0.008895233911706567, + "grad_norm": 6.339474201202393, + "learning_rate": 4.390779363336993e-08, + "loss": 1.5215, + "mean_token_accuracy": 0.6059188842773438, + "num_tokens": 2081518.0, + "step": 81 + }, + { + "epoch": 0.009005051614320228, + "grad_norm": 7.011707782745361, + "learning_rate": 4.445664105378705e-08, + "loss": 1.5472, + "mean_token_accuracy": 0.5930252075195312, + "num_tokens": 2106590.0, + "step": 82 + }, + { + "epoch": 0.00911486931693389, + "grad_norm": 7.902180194854736, + "learning_rate": 4.500548847420417e-08, + "loss": 1.454, + "mean_token_accuracy": 
0.6231611967086792, + "num_tokens": 2127161.0, + "step": 83 + }, + { + "epoch": 0.009224687019547552, + "grad_norm": 6.171265125274658, + "learning_rate": 4.555433589462129e-08, + "loss": 1.4238, + "mean_token_accuracy": 0.6300718188285828, + "num_tokens": 2154089.0, + "step": 84 + }, + { + "epoch": 0.009334504722161213, + "grad_norm": 7.001099109649658, + "learning_rate": 4.6103183315038415e-08, + "loss": 1.5609, + "mean_token_accuracy": 0.5908949375152588, + "num_tokens": 2179741.0, + "step": 85 + }, + { + "epoch": 0.009444322424774873, + "grad_norm": 7.402109146118164, + "learning_rate": 4.6652030735455537e-08, + "loss": 1.4331, + "mean_token_accuracy": 0.6273126602172852, + "num_tokens": 2201306.0, + "step": 86 + }, + { + "epoch": 0.009554140127388535, + "grad_norm": 6.573662757873535, + "learning_rate": 4.720087815587267e-08, + "loss": 1.4775, + "mean_token_accuracy": 0.609107255935669, + "num_tokens": 2229038.0, + "step": 87 + }, + { + "epoch": 0.009663957830002196, + "grad_norm": 6.345874309539795, + "learning_rate": 4.7749725576289793e-08, + "loss": 1.4165, + "mean_token_accuracy": 0.6277764439582825, + "num_tokens": 2253513.0, + "step": 88 + }, + { + "epoch": 0.009773775532615858, + "grad_norm": 7.721385478973389, + "learning_rate": 4.8298572996706915e-08, + "loss": 1.4188, + "mean_token_accuracy": 0.6242532134056091, + "num_tokens": 2273969.0, + "step": 89 + }, + { + "epoch": 0.00988359323522952, + "grad_norm": 5.779064655303955, + "learning_rate": 4.884742041712404e-08, + "loss": 1.3855, + "mean_token_accuracy": 0.6349644660949707, + "num_tokens": 2302023.0, + "step": 90 + }, + { + "epoch": 0.009993410937843181, + "grad_norm": 6.387057781219482, + "learning_rate": 4.939626783754116e-08, + "loss": 1.4611, + "mean_token_accuracy": 0.6204542517662048, + "num_tokens": 2327887.0, + "step": 91 + }, + { + "epoch": 0.010103228640456841, + "grad_norm": 5.600723743438721, + "learning_rate": 4.994511525795828e-08, + "loss": 1.3815, + "mean_token_accuracy": 
0.6347456574440002, + "num_tokens": 2356053.0, + "step": 92 + }, + { + "epoch": 0.010213046343070502, + "grad_norm": 5.922283172607422, + "learning_rate": 5.049396267837541e-08, + "loss": 1.3942, + "mean_token_accuracy": 0.6323148012161255, + "num_tokens": 2384482.0, + "step": 93 + }, + { + "epoch": 0.010322864045684164, + "grad_norm": 6.321078300476074, + "learning_rate": 5.104281009879254e-08, + "loss": 1.461, + "mean_token_accuracy": 0.6101053357124329, + "num_tokens": 2411179.0, + "step": 94 + }, + { + "epoch": 0.010432681748297826, + "grad_norm": 6.359566688537598, + "learning_rate": 5.159165751920966e-08, + "loss": 1.4268, + "mean_token_accuracy": 0.6202911138534546, + "num_tokens": 2438694.0, + "step": 95 + }, + { + "epoch": 0.010542499450911487, + "grad_norm": 7.025219917297363, + "learning_rate": 5.214050493962678e-08, + "loss": 1.4683, + "mean_token_accuracy": 0.6118279695510864, + "num_tokens": 2461633.0, + "step": 96 + }, + { + "epoch": 0.010652317153525149, + "grad_norm": 6.061488628387451, + "learning_rate": 5.2689352360043903e-08, + "loss": 1.4636, + "mean_token_accuracy": 0.6108254194259644, + "num_tokens": 2491067.0, + "step": 97 + }, + { + "epoch": 0.010762134856138809, + "grad_norm": 6.704017162322998, + "learning_rate": 5.3238199780461025e-08, + "loss": 1.4574, + "mean_token_accuracy": 0.6240578293800354, + "num_tokens": 2513963.0, + "step": 98 + }, + { + "epoch": 0.01087195255875247, + "grad_norm": 6.961119651794434, + "learning_rate": 5.3787047200878154e-08, + "loss": 1.502, + "mean_token_accuracy": 0.6081093549728394, + "num_tokens": 2537784.0, + "step": 99 + }, + { + "epoch": 0.010981770261366132, + "grad_norm": 6.574762344360352, + "learning_rate": 5.433589462129528e-08, + "loss": 1.5066, + "mean_token_accuracy": 0.6022794246673584, + "num_tokens": 2562433.0, + "step": 100 + }, + { + "epoch": 0.011091587963979794, + "grad_norm": 6.373205661773682, + "learning_rate": 5.4884742041712404e-08, + "loss": 1.452, + "mean_token_accuracy": 
0.6240890622138977, + "num_tokens": 2586824.0, + "step": 101 + }, + { + "epoch": 0.011201405666593455, + "grad_norm": 6.654780387878418, + "learning_rate": 5.5433589462129526e-08, + "loss": 1.4758, + "mean_token_accuracy": 0.6141456365585327, + "num_tokens": 2612002.0, + "step": 102 + }, + { + "epoch": 0.011311223369207117, + "grad_norm": 6.927950859069824, + "learning_rate": 5.598243688254665e-08, + "loss": 1.5391, + "mean_token_accuracy": 0.6060784459114075, + "num_tokens": 2635858.0, + "step": 103 + }, + { + "epoch": 0.011421041071820778, + "grad_norm": 8.259126663208008, + "learning_rate": 5.653128430296377e-08, + "loss": 1.507, + "mean_token_accuracy": 0.6077144742012024, + "num_tokens": 2655009.0, + "step": 104 + }, + { + "epoch": 0.011530858774434438, + "grad_norm": 6.990094184875488, + "learning_rate": 5.70801317233809e-08, + "loss": 1.5251, + "mean_token_accuracy": 0.6034177541732788, + "num_tokens": 2679417.0, + "step": 105 + }, + { + "epoch": 0.0116406764770481, + "grad_norm": 6.627439022064209, + "learning_rate": 5.762897914379802e-08, + "loss": 1.3864, + "mean_token_accuracy": 0.6287237405776978, + "num_tokens": 2703775.0, + "step": 106 + }, + { + "epoch": 0.011750494179661761, + "grad_norm": 7.642573356628418, + "learning_rate": 5.817782656421515e-08, + "loss": 1.3793, + "mean_token_accuracy": 0.6353264451026917, + "num_tokens": 2723510.0, + "step": 107 + }, + { + "epoch": 0.011860311882275423, + "grad_norm": 8.127613067626953, + "learning_rate": 5.872667398463227e-08, + "loss": 1.4226, + "mean_token_accuracy": 0.6305581331253052, + "num_tokens": 2742273.0, + "step": 108 + }, + { + "epoch": 0.011970129584889085, + "grad_norm": 6.873010158538818, + "learning_rate": 5.927552140504939e-08, + "loss": 1.43, + "mean_token_accuracy": 0.6223410367965698, + "num_tokens": 2767241.0, + "step": 109 + }, + { + "epoch": 0.012079947287502746, + "grad_norm": 7.534580707550049, + "learning_rate": 5.982436882546651e-08, + "loss": 1.4832, + "mean_token_accuracy": 
0.61568683385849, + "num_tokens": 2789346.0, + "step": 110 + }, + { + "epoch": 0.012189764990116406, + "grad_norm": 6.5029754638671875, + "learning_rate": 6.037321624588364e-08, + "loss": 1.5072, + "mean_token_accuracy": 0.6077187657356262, + "num_tokens": 2818231.0, + "step": 111 + }, + { + "epoch": 0.012299582692730068, + "grad_norm": 6.420916557312012, + "learning_rate": 6.092206366630077e-08, + "loss": 1.4379, + "mean_token_accuracy": 0.6148214340209961, + "num_tokens": 2845644.0, + "step": 112 + }, + { + "epoch": 0.01240940039534373, + "grad_norm": 6.959995269775391, + "learning_rate": 6.147091108671789e-08, + "loss": 1.4099, + "mean_token_accuracy": 0.6223923563957214, + "num_tokens": 2867271.0, + "step": 113 + }, + { + "epoch": 0.01251921809795739, + "grad_norm": 6.979227542877197, + "learning_rate": 6.201975850713501e-08, + "loss": 1.3676, + "mean_token_accuracy": 0.6377322673797607, + "num_tokens": 2890936.0, + "step": 114 + }, + { + "epoch": 0.012629035800571052, + "grad_norm": 6.199805736541748, + "learning_rate": 6.256860592755214e-08, + "loss": 1.415, + "mean_token_accuracy": 0.624497652053833, + "num_tokens": 2916791.0, + "step": 115 + }, + { + "epoch": 0.012738853503184714, + "grad_norm": 6.035910129547119, + "learning_rate": 6.311745334796927e-08, + "loss": 1.3772, + "mean_token_accuracy": 0.6290345191955566, + "num_tokens": 2945597.0, + "step": 116 + }, + { + "epoch": 0.012848671205798374, + "grad_norm": 7.506038665771484, + "learning_rate": 6.366630076838639e-08, + "loss": 1.4273, + "mean_token_accuracy": 0.6168444156646729, + "num_tokens": 2967299.0, + "step": 117 + }, + { + "epoch": 0.012958488908412035, + "grad_norm": 6.814609527587891, + "learning_rate": 6.421514818880352e-08, + "loss": 1.4521, + "mean_token_accuracy": 0.6105451583862305, + "num_tokens": 2991365.0, + "step": 118 + }, + { + "epoch": 0.013068306611025697, + "grad_norm": 7.0744709968566895, + "learning_rate": 6.476399560922063e-08, + "loss": 1.4096, + "mean_token_accuracy": 
0.6299569606781006, + "num_tokens": 3014099.0, + "step": 119 + }, + { + "epoch": 0.013178124313639359, + "grad_norm": 6.2215118408203125, + "learning_rate": 6.531284302963776e-08, + "loss": 1.4535, + "mean_token_accuracy": 0.6123992204666138, + "num_tokens": 3042677.0, + "step": 120 + }, + { + "epoch": 0.01328794201625302, + "grad_norm": 5.484449863433838, + "learning_rate": 6.586169045005487e-08, + "loss": 1.2646, + "mean_token_accuracy": 0.6550175547599792, + "num_tokens": 3070063.0, + "step": 121 + }, + { + "epoch": 0.013397759718866682, + "grad_norm": 5.59461784362793, + "learning_rate": 6.6410537870472e-08, + "loss": 1.4872, + "mean_token_accuracy": 0.6053921580314636, + "num_tokens": 3100157.0, + "step": 122 + }, + { + "epoch": 0.013507577421480343, + "grad_norm": 5.966394424438477, + "learning_rate": 6.695938529088913e-08, + "loss": 1.3921, + "mean_token_accuracy": 0.6278308629989624, + "num_tokens": 3125104.0, + "step": 123 + }, + { + "epoch": 0.013617395124094003, + "grad_norm": 6.0773162841796875, + "learning_rate": 6.750823271130625e-08, + "loss": 1.3772, + "mean_token_accuracy": 0.6298218369483948, + "num_tokens": 3148821.0, + "step": 124 + }, + { + "epoch": 0.013727212826707665, + "grad_norm": 5.446869850158691, + "learning_rate": 6.805708013172338e-08, + "loss": 1.4078, + "mean_token_accuracy": 0.6161810159683228, + "num_tokens": 3177268.0, + "step": 125 + }, + { + "epoch": 0.013837030529321327, + "grad_norm": 5.212534427642822, + "learning_rate": 6.86059275521405e-08, + "loss": 1.3587, + "mean_token_accuracy": 0.6292580366134644, + "num_tokens": 3204336.0, + "step": 126 + }, + { + "epoch": 0.013946848231934988, + "grad_norm": 5.732970237731934, + "learning_rate": 6.915477497255763e-08, + "loss": 1.3119, + "mean_token_accuracy": 0.6473299264907837, + "num_tokens": 3226462.0, + "step": 127 + }, + { + "epoch": 0.01405666593454865, + "grad_norm": 5.542710304260254, + "learning_rate": 6.970362239297475e-08, + "loss": 1.3927, + "mean_token_accuracy": 
0.6285200119018555, + "num_tokens": 3249900.0, + "step": 128 + }, + { + "epoch": 0.014166483637162311, + "grad_norm": 5.588710308074951, + "learning_rate": 7.025246981339188e-08, + "loss": 1.4624, + "mean_token_accuracy": 0.6122121214866638, + "num_tokens": 3274345.0, + "step": 129 + }, + { + "epoch": 0.014276301339775971, + "grad_norm": 5.375238418579102, + "learning_rate": 7.0801317233809e-08, + "loss": 1.3395, + "mean_token_accuracy": 0.6463728547096252, + "num_tokens": 3300749.0, + "step": 130 + }, + { + "epoch": 0.014386119042389633, + "grad_norm": 4.769513130187988, + "learning_rate": 7.135016465422612e-08, + "loss": 1.3962, + "mean_token_accuracy": 0.6281101703643799, + "num_tokens": 3332077.0, + "step": 131 + }, + { + "epoch": 0.014495936745003294, + "grad_norm": 5.29296350479126, + "learning_rate": 7.189901207464325e-08, + "loss": 1.465, + "mean_token_accuracy": 0.60999596118927, + "num_tokens": 3358665.0, + "step": 132 + }, + { + "epoch": 0.014605754447616956, + "grad_norm": 5.23523473739624, + "learning_rate": 7.244785949506036e-08, + "loss": 1.4261, + "mean_token_accuracy": 0.6075029373168945, + "num_tokens": 3382886.0, + "step": 133 + }, + { + "epoch": 0.014715572150230618, + "grad_norm": 4.549793243408203, + "learning_rate": 7.299670691547749e-08, + "loss": 1.4288, + "mean_token_accuracy": 0.6165145039558411, + "num_tokens": 3416614.0, + "step": 134 + }, + { + "epoch": 0.01482538985284428, + "grad_norm": 5.058295726776123, + "learning_rate": 7.354555433589462e-08, + "loss": 1.3733, + "mean_token_accuracy": 0.6256067752838135, + "num_tokens": 3443253.0, + "step": 135 + }, + { + "epoch": 0.014935207555457939, + "grad_norm": 5.6178131103515625, + "learning_rate": 7.409440175631174e-08, + "loss": 1.4258, + "mean_token_accuracy": 0.609893798828125, + "num_tokens": 3466396.0, + "step": 136 + }, + { + "epoch": 0.0150450252580716, + "grad_norm": 5.403201103210449, + "learning_rate": 7.464324917672886e-08, + "loss": 1.4431, + "mean_token_accuracy": 
0.6119782328605652, + "num_tokens": 3490297.0, + "step": 137 + }, + { + "epoch": 0.015154842960685262, + "grad_norm": 5.094993591308594, + "learning_rate": 7.519209659714599e-08, + "loss": 1.4167, + "mean_token_accuracy": 0.6101880073547363, + "num_tokens": 3517790.0, + "step": 138 + }, + { + "epoch": 0.015264660663298924, + "grad_norm": 4.587831974029541, + "learning_rate": 7.574094401756312e-08, + "loss": 1.4196, + "mean_token_accuracy": 0.621229887008667, + "num_tokens": 3548138.0, + "step": 139 + }, + { + "epoch": 0.015374478365912585, + "grad_norm": 4.928002834320068, + "learning_rate": 7.628979143798024e-08, + "loss": 1.2899, + "mean_token_accuracy": 0.6450660228729248, + "num_tokens": 3571002.0, + "step": 140 + }, + { + "epoch": 0.015484296068526247, + "grad_norm": 5.296974182128906, + "learning_rate": 7.683863885839736e-08, + "loss": 1.3171, + "mean_token_accuracy": 0.6363016366958618, + "num_tokens": 3593458.0, + "step": 141 + }, + { + "epoch": 0.015594113771139909, + "grad_norm": 5.3306779861450195, + "learning_rate": 7.738748627881449e-08, + "loss": 1.3282, + "mean_token_accuracy": 0.6312785744667053, + "num_tokens": 3616376.0, + "step": 142 + }, + { + "epoch": 0.01570393147375357, + "grad_norm": 5.090769290924072, + "learning_rate": 7.793633369923161e-08, + "loss": 1.3472, + "mean_token_accuracy": 0.6470484733581543, + "num_tokens": 3640127.0, + "step": 143 + }, + { + "epoch": 0.015813749176367232, + "grad_norm": 5.093803882598877, + "learning_rate": 7.848518111964874e-08, + "loss": 1.3019, + "mean_token_accuracy": 0.6353912353515625, + "num_tokens": 3663634.0, + "step": 144 + }, + { + "epoch": 0.01592356687898089, + "grad_norm": 5.118440628051758, + "learning_rate": 7.903402854006585e-08, + "loss": 1.4034, + "mean_token_accuracy": 0.6208579540252686, + "num_tokens": 3687300.0, + "step": 145 + }, + { + "epoch": 0.01603338458159455, + "grad_norm": 5.2011332511901855, + "learning_rate": 7.958287596048298e-08, + "loss": 1.4243, + "mean_token_accuracy": 
0.6092373728752136, + "num_tokens": 3708545.0, + "step": 146 + }, + { + "epoch": 0.016143202284208215, + "grad_norm": 5.246453285217285, + "learning_rate": 8.01317233809001e-08, + "loss": 1.3225, + "mean_token_accuracy": 0.6434162855148315, + "num_tokens": 3730029.0, + "step": 147 + }, + { + "epoch": 0.016253019986821875, + "grad_norm": 4.223818302154541, + "learning_rate": 8.068057080131722e-08, + "loss": 1.3592, + "mean_token_accuracy": 0.6255754232406616, + "num_tokens": 3761444.0, + "step": 148 + }, + { + "epoch": 0.016362837689435538, + "grad_norm": 5.277386665344238, + "learning_rate": 8.122941822173437e-08, + "loss": 1.3414, + "mean_token_accuracy": 0.6382550001144409, + "num_tokens": 3783655.0, + "step": 149 + }, + { + "epoch": 0.016472655392049198, + "grad_norm": 4.4582085609436035, + "learning_rate": 8.177826564215148e-08, + "loss": 1.3654, + "mean_token_accuracy": 0.6329643726348877, + "num_tokens": 3811043.0, + "step": 150 + }, + { + "epoch": 0.01658247309466286, + "grad_norm": 4.587891101837158, + "learning_rate": 8.232711306256861e-08, + "loss": 1.4307, + "mean_token_accuracy": 0.6065413355827332, + "num_tokens": 3839783.0, + "step": 151 + }, + { + "epoch": 0.01669229079727652, + "grad_norm": 4.9474287033081055, + "learning_rate": 8.287596048298572e-08, + "loss": 1.2886, + "mean_token_accuracy": 0.6488010287284851, + "num_tokens": 3864121.0, + "step": 152 + }, + { + "epoch": 0.01680210849989018, + "grad_norm": 4.90735387802124, + "learning_rate": 8.342480790340285e-08, + "loss": 1.3437, + "mean_token_accuracy": 0.6383364200592041, + "num_tokens": 3888944.0, + "step": 153 + }, + { + "epoch": 0.016911926202503844, + "grad_norm": 4.498086929321289, + "learning_rate": 8.397365532381998e-08, + "loss": 1.3541, + "mean_token_accuracy": 0.6318585872650146, + "num_tokens": 3916270.0, + "step": 154 + }, + { + "epoch": 0.017021743905117504, + "grad_norm": 4.161583423614502, + "learning_rate": 8.45225027442371e-08, + "loss": 1.3972, + "mean_token_accuracy": 
0.6197839379310608, + "num_tokens": 3947472.0, + "step": 155 + }, + { + "epoch": 0.017131561607731167, + "grad_norm": 4.425992488861084, + "learning_rate": 8.507135016465423e-08, + "loss": 1.3002, + "mean_token_accuracy": 0.6422058343887329, + "num_tokens": 3972559.0, + "step": 156 + }, + { + "epoch": 0.017241379310344827, + "grad_norm": 4.7410688400268555, + "learning_rate": 8.562019758507134e-08, + "loss": 1.3904, + "mean_token_accuracy": 0.6258483529090881, + "num_tokens": 4000358.0, + "step": 157 + }, + { + "epoch": 0.017351197012958487, + "grad_norm": 4.5831098556518555, + "learning_rate": 8.616904500548847e-08, + "loss": 1.3112, + "mean_token_accuracy": 0.6385015249252319, + "num_tokens": 4024066.0, + "step": 158 + }, + { + "epoch": 0.01746101471557215, + "grad_norm": 4.682743072509766, + "learning_rate": 8.671789242590558e-08, + "loss": 1.3802, + "mean_token_accuracy": 0.6234796047210693, + "num_tokens": 4049046.0, + "step": 159 + }, + { + "epoch": 0.01757083241818581, + "grad_norm": 5.081345081329346, + "learning_rate": 8.726673984632271e-08, + "loss": 1.4051, + "mean_token_accuracy": 0.6202201843261719, + "num_tokens": 4072253.0, + "step": 160 + }, + { + "epoch": 0.017680650120799474, + "grad_norm": 4.183385848999023, + "learning_rate": 8.781558726673985e-08, + "loss": 1.2926, + "mean_token_accuracy": 0.6491385102272034, + "num_tokens": 4100888.0, + "step": 161 + }, + { + "epoch": 0.017790467823413134, + "grad_norm": 4.263305187225342, + "learning_rate": 8.836443468715697e-08, + "loss": 1.3689, + "mean_token_accuracy": 0.628709077835083, + "num_tokens": 4127399.0, + "step": 162 + }, + { + "epoch": 0.017900285526026797, + "grad_norm": 4.526121139526367, + "learning_rate": 8.89132821075741e-08, + "loss": 1.3748, + "mean_token_accuracy": 0.6218031048774719, + "num_tokens": 4151932.0, + "step": 163 + }, + { + "epoch": 0.018010103228640457, + "grad_norm": 4.412806987762451, + "learning_rate": 8.946212952799121e-08, + "loss": 1.3171, + "mean_token_accuracy": 
0.6491904258728027, + "num_tokens": 4178976.0, + "step": 164 + }, + { + "epoch": 0.018119920931254117, + "grad_norm": 4.154404163360596, + "learning_rate": 9.001097694840834e-08, + "loss": 1.3219, + "mean_token_accuracy": 0.6406502723693848, + "num_tokens": 4208566.0, + "step": 165 + }, + { + "epoch": 0.01822973863386778, + "grad_norm": 4.071923732757568, + "learning_rate": 9.055982436882546e-08, + "loss": 1.4124, + "mean_token_accuracy": 0.6164237856864929, + "num_tokens": 4238340.0, + "step": 166 + }, + { + "epoch": 0.01833955633648144, + "grad_norm": 4.428764343261719, + "learning_rate": 9.110867178924259e-08, + "loss": 1.4262, + "mean_token_accuracy": 0.6126191020011902, + "num_tokens": 4266290.0, + "step": 167 + }, + { + "epoch": 0.018449374039095103, + "grad_norm": 5.466249942779541, + "learning_rate": 9.165751920965971e-08, + "loss": 1.281, + "mean_token_accuracy": 0.6397951245307922, + "num_tokens": 4286088.0, + "step": 168 + }, + { + "epoch": 0.018559191741708763, + "grad_norm": 4.681741237640381, + "learning_rate": 9.220636663007683e-08, + "loss": 1.3498, + "mean_token_accuracy": 0.6268871426582336, + "num_tokens": 4308325.0, + "step": 169 + }, + { + "epoch": 0.018669009444322426, + "grad_norm": 5.455130100250244, + "learning_rate": 9.275521405049396e-08, + "loss": 1.2415, + "mean_token_accuracy": 0.6571863293647766, + "num_tokens": 4325903.0, + "step": 170 + }, + { + "epoch": 0.018778827146936086, + "grad_norm": 5.674758434295654, + "learning_rate": 9.330406147091107e-08, + "loss": 1.3344, + "mean_token_accuracy": 0.6356186866760254, + "num_tokens": 4344609.0, + "step": 171 + }, + { + "epoch": 0.018888644849549746, + "grad_norm": 4.79982852935791, + "learning_rate": 9.38529088913282e-08, + "loss": 1.3726, + "mean_token_accuracy": 0.6185266971588135, + "num_tokens": 4373666.0, + "step": 172 + }, + { + "epoch": 0.01899846255216341, + "grad_norm": 4.466107368469238, + "learning_rate": 9.440175631174534e-08, + "loss": 1.3673, + "mean_token_accuracy": 
0.624434769153595, + "num_tokens": 4398895.0, + "step": 173 + }, + { + "epoch": 0.01910828025477707, + "grad_norm": 4.103312969207764, + "learning_rate": 9.495060373216246e-08, + "loss": 1.2315, + "mean_token_accuracy": 0.6536507606506348, + "num_tokens": 4424647.0, + "step": 174 + }, + { + "epoch": 0.019218097957390733, + "grad_norm": 4.696688652038574, + "learning_rate": 9.549945115257959e-08, + "loss": 1.304, + "mean_token_accuracy": 0.6435081958770752, + "num_tokens": 4444908.0, + "step": 175 + }, + { + "epoch": 0.019327915660004392, + "grad_norm": 4.111949443817139, + "learning_rate": 9.60482985729967e-08, + "loss": 1.3357, + "mean_token_accuracy": 0.6369770765304565, + "num_tokens": 4474119.0, + "step": 176 + }, + { + "epoch": 0.019437733362618052, + "grad_norm": 4.579794883728027, + "learning_rate": 9.659714599341383e-08, + "loss": 1.3313, + "mean_token_accuracy": 0.6362183094024658, + "num_tokens": 4497344.0, + "step": 177 + }, + { + "epoch": 0.019547551065231716, + "grad_norm": 3.853769540786743, + "learning_rate": 9.714599341383095e-08, + "loss": 1.3569, + "mean_token_accuracy": 0.6280947923660278, + "num_tokens": 4526184.0, + "step": 178 + }, + { + "epoch": 0.019657368767845376, + "grad_norm": 3.8777501583099365, + "learning_rate": 9.769484083424807e-08, + "loss": 1.2964, + "mean_token_accuracy": 0.6459435224533081, + "num_tokens": 4554952.0, + "step": 179 + }, + { + "epoch": 0.01976718647045904, + "grad_norm": 4.469107151031494, + "learning_rate": 9.82436882546652e-08, + "loss": 1.3597, + "mean_token_accuracy": 0.6248977184295654, + "num_tokens": 4577398.0, + "step": 180 + }, + { + "epoch": 0.0198770041730727, + "grad_norm": 3.528329610824585, + "learning_rate": 9.879253567508232e-08, + "loss": 1.296, + "mean_token_accuracy": 0.6485646963119507, + "num_tokens": 4608129.0, + "step": 181 + }, + { + "epoch": 0.019986821875686362, + "grad_norm": 3.927687644958496, + "learning_rate": 9.934138309549945e-08, + "loss": 1.2858, + "mean_token_accuracy": 
0.6433649063110352, + "num_tokens": 4639069.0, + "step": 182 + }, + { + "epoch": 0.020096639578300022, + "grad_norm": 4.007258892059326, + "learning_rate": 9.989023051591656e-08, + "loss": 1.317, + "mean_token_accuracy": 0.6340017914772034, + "num_tokens": 4668648.0, + "step": 183 + }, + { + "epoch": 0.020206457280913682, + "grad_norm": 3.756974697113037, + "learning_rate": 1.0043907793633369e-07, + "loss": 1.3249, + "mean_token_accuracy": 0.6357885599136353, + "num_tokens": 4699445.0, + "step": 184 + }, + { + "epoch": 0.020316274983527345, + "grad_norm": 4.286639213562012, + "learning_rate": 1.0098792535675082e-07, + "loss": 1.2932, + "mean_token_accuracy": 0.6461837291717529, + "num_tokens": 4721846.0, + "step": 185 + }, + { + "epoch": 0.020426092686141005, + "grad_norm": 6.0904622077941895, + "learning_rate": 1.0153677277716795e-07, + "loss": 1.2952, + "mean_token_accuracy": 0.646186351776123, + "num_tokens": 4738996.0, + "step": 186 + }, + { + "epoch": 0.02053591038875467, + "grad_norm": 4.076390743255615, + "learning_rate": 1.0208562019758508e-07, + "loss": 1.297, + "mean_token_accuracy": 0.6428448557853699, + "num_tokens": 4763252.0, + "step": 187 + }, + { + "epoch": 0.020645728091368328, + "grad_norm": 4.410069942474365, + "learning_rate": 1.0263446761800219e-07, + "loss": 1.3143, + "mean_token_accuracy": 0.6392781734466553, + "num_tokens": 4787704.0, + "step": 188 + }, + { + "epoch": 0.02075554579398199, + "grad_norm": 3.933361530303955, + "learning_rate": 1.0318331503841932e-07, + "loss": 1.3072, + "mean_token_accuracy": 0.640906810760498, + "num_tokens": 4813901.0, + "step": 189 + }, + { + "epoch": 0.02086536349659565, + "grad_norm": 5.353473663330078, + "learning_rate": 1.0373216245883643e-07, + "loss": 1.4534, + "mean_token_accuracy": 0.6222183704376221, + "num_tokens": 4836090.0, + "step": 190 + }, + { + "epoch": 0.02097518119920931, + "grad_norm": 3.790947914123535, + "learning_rate": 1.0428100987925356e-07, + "loss": 1.3223, + "mean_token_accuracy": 
0.6328597068786621, + "num_tokens": 4865918.0, + "step": 191 + }, + { + "epoch": 0.021084998901822975, + "grad_norm": 4.251343727111816, + "learning_rate": 1.0482985729967068e-07, + "loss": 1.3318, + "mean_token_accuracy": 0.6315561532974243, + "num_tokens": 4894228.0, + "step": 192 + }, + { + "epoch": 0.021194816604436634, + "grad_norm": 3.5747430324554443, + "learning_rate": 1.0537870472008781e-07, + "loss": 1.346, + "mean_token_accuracy": 0.6313683390617371, + "num_tokens": 4926540.0, + "step": 193 + }, + { + "epoch": 0.021304634307050298, + "grad_norm": 4.349277019500732, + "learning_rate": 1.0592755214050494e-07, + "loss": 1.4158, + "mean_token_accuracy": 0.6144803166389465, + "num_tokens": 4949950.0, + "step": 194 + }, + { + "epoch": 0.021414452009663958, + "grad_norm": 4.416962623596191, + "learning_rate": 1.0647639956092205e-07, + "loss": 1.2823, + "mean_token_accuracy": 0.6452149748802185, + "num_tokens": 4971681.0, + "step": 195 + }, + { + "epoch": 0.021524269712277617, + "grad_norm": 4.398171424865723, + "learning_rate": 1.0702524698133918e-07, + "loss": 1.3097, + "mean_token_accuracy": 0.643372654914856, + "num_tokens": 4994650.0, + "step": 196 + }, + { + "epoch": 0.02163408741489128, + "grad_norm": 4.2732086181640625, + "learning_rate": 1.0757409440175631e-07, + "loss": 1.2869, + "mean_token_accuracy": 0.6425480842590332, + "num_tokens": 5021958.0, + "step": 197 + }, + { + "epoch": 0.02174390511750494, + "grad_norm": 4.680013179779053, + "learning_rate": 1.0812294182217344e-07, + "loss": 1.2919, + "mean_token_accuracy": 0.6426190733909607, + "num_tokens": 5046314.0, + "step": 198 + }, + { + "epoch": 0.021853722820118604, + "grad_norm": 5.971451759338379, + "learning_rate": 1.0867178924259056e-07, + "loss": 1.2193, + "mean_token_accuracy": 0.6533739566802979, + "num_tokens": 5062927.0, + "step": 199 + }, + { + "epoch": 0.021963540522732264, + "grad_norm": 4.078697681427002, + "learning_rate": 1.0922063666300768e-07, + "loss": 1.3394, + 
"mean_token_accuracy": 0.6266604661941528, + "num_tokens": 5090146.0, + "step": 200 + }, + { + "epoch": 0.022073358225345927, + "grad_norm": 4.611117839813232, + "learning_rate": 1.0976948408342481e-07, + "loss": 1.3293, + "mean_token_accuracy": 0.626224935054779, + "num_tokens": 5116632.0, + "step": 201 + }, + { + "epoch": 0.022183175927959587, + "grad_norm": 4.608649730682373, + "learning_rate": 1.1031833150384192e-07, + "loss": 1.3734, + "mean_token_accuracy": 0.617669403553009, + "num_tokens": 5146035.0, + "step": 202 + }, + { + "epoch": 0.022292993630573247, + "grad_norm": 4.011085510253906, + "learning_rate": 1.1086717892425905e-07, + "loss": 1.1461, + "mean_token_accuracy": 0.6687418222427368, + "num_tokens": 5170311.0, + "step": 203 + }, + { + "epoch": 0.02240281133318691, + "grad_norm": 5.001126766204834, + "learning_rate": 1.1141602634467617e-07, + "loss": 1.2841, + "mean_token_accuracy": 0.6411000490188599, + "num_tokens": 5190705.0, + "step": 204 + }, + { + "epoch": 0.02251262903580057, + "grad_norm": 4.629586219787598, + "learning_rate": 1.119648737650933e-07, + "loss": 1.3336, + "mean_token_accuracy": 0.6360781788825989, + "num_tokens": 5218303.0, + "step": 205 + }, + { + "epoch": 0.022622446738414233, + "grad_norm": 4.58708381652832, + "learning_rate": 1.1251372118551042e-07, + "loss": 1.3104, + "mean_token_accuracy": 0.6341338753700256, + "num_tokens": 5245235.0, + "step": 206 + }, + { + "epoch": 0.022732264441027893, + "grad_norm": 4.84835958480835, + "learning_rate": 1.1306256860592754e-07, + "loss": 1.3264, + "mean_token_accuracy": 0.636208176612854, + "num_tokens": 5271191.0, + "step": 207 + }, + { + "epoch": 0.022842082143641557, + "grad_norm": 4.2012763023376465, + "learning_rate": 1.1361141602634467e-07, + "loss": 1.3205, + "mean_token_accuracy": 0.6368701457977295, + "num_tokens": 5298152.0, + "step": 208 + }, + { + "epoch": 0.022951899846255217, + "grad_norm": 4.3350934982299805, + "learning_rate": 1.141602634467618e-07, + "loss": 1.3146, + 
"mean_token_accuracy": 0.6375466585159302, + "num_tokens": 5326400.0, + "step": 209 + }, + { + "epoch": 0.023061717548868876, + "grad_norm": 4.3891801834106445, + "learning_rate": 1.1470911086717892e-07, + "loss": 1.3739, + "mean_token_accuracy": 0.627068817615509, + "num_tokens": 5357353.0, + "step": 210 + }, + { + "epoch": 0.02317153525148254, + "grad_norm": 5.801607608795166, + "learning_rate": 1.1525795828759604e-07, + "loss": 1.2377, + "mean_token_accuracy": 0.6535273194313049, + "num_tokens": 5383563.0, + "step": 211 + }, + { + "epoch": 0.0232813529540962, + "grad_norm": 4.689700603485107, + "learning_rate": 1.1580680570801317e-07, + "loss": 1.3564, + "mean_token_accuracy": 0.6236058473587036, + "num_tokens": 5414546.0, + "step": 212 + }, + { + "epoch": 0.023391170656709863, + "grad_norm": 5.463894844055176, + "learning_rate": 1.163556531284303e-07, + "loss": 1.3578, + "mean_token_accuracy": 0.6378907561302185, + "num_tokens": 5442533.0, + "step": 213 + }, + { + "epoch": 0.023500988359323523, + "grad_norm": 8.244662284851074, + "learning_rate": 1.1690450054884741e-07, + "loss": 1.2438, + "mean_token_accuracy": 0.6541065573692322, + "num_tokens": 5464737.0, + "step": 214 + }, + { + "epoch": 0.023610806061937183, + "grad_norm": 5.0995354652404785, + "learning_rate": 1.1745334796926454e-07, + "loss": 1.2844, + "mean_token_accuracy": 0.6448526382446289, + "num_tokens": 5495407.0, + "step": 215 + }, + { + "epoch": 0.023720623764550846, + "grad_norm": 6.098871231079102, + "learning_rate": 1.1800219538968166e-07, + "loss": 1.2321, + "mean_token_accuracy": 0.653816819190979, + "num_tokens": 5515250.0, + "step": 216 + }, + { + "epoch": 0.023830441467164506, + "grad_norm": 5.456662178039551, + "learning_rate": 1.1855104281009878e-07, + "loss": 1.3502, + "mean_token_accuracy": 0.6316908597946167, + "num_tokens": 5547103.0, + "step": 217 + }, + { + "epoch": 0.02394025916977817, + "grad_norm": 4.221183776855469, + "learning_rate": 1.1909989023051591e-07, + "loss": 1.3026, 
+ "mean_token_accuracy": 0.6436402201652527, + "num_tokens": 5576176.0, + "step": 218 + }, + { + "epoch": 0.02405007687239183, + "grad_norm": 5.538569927215576, + "learning_rate": 1.1964873765093303e-07, + "loss": 1.2864, + "mean_token_accuracy": 0.6429834365844727, + "num_tokens": 5604152.0, + "step": 219 + }, + { + "epoch": 0.024159894575005492, + "grad_norm": 6.504305839538574, + "learning_rate": 1.2019758507135017e-07, + "loss": 1.2752, + "mean_token_accuracy": 0.6440222263336182, + "num_tokens": 5624982.0, + "step": 220 + }, + { + "epoch": 0.024269712277619152, + "grad_norm": 4.656147003173828, + "learning_rate": 1.2074643249176729e-07, + "loss": 1.2476, + "mean_token_accuracy": 0.6536533832550049, + "num_tokens": 5656361.0, + "step": 221 + }, + { + "epoch": 0.024379529980232812, + "grad_norm": 5.42180871963501, + "learning_rate": 1.212952799121844e-07, + "loss": 1.2897, + "mean_token_accuracy": 0.6493082046508789, + "num_tokens": 5681233.0, + "step": 222 + }, + { + "epoch": 0.024489347682846475, + "grad_norm": 7.3041815757751465, + "learning_rate": 1.2184412733260154e-07, + "loss": 1.2336, + "mean_token_accuracy": 0.660628080368042, + "num_tokens": 5700540.0, + "step": 223 + }, + { + "epoch": 0.024599165385460135, + "grad_norm": 5.866429805755615, + "learning_rate": 1.2239297475301866e-07, + "loss": 1.16, + "mean_token_accuracy": 0.6649694442749023, + "num_tokens": 5722400.0, + "step": 224 + }, + { + "epoch": 0.0247089830880738, + "grad_norm": 5.752702236175537, + "learning_rate": 1.2294182217343577e-07, + "loss": 1.2453, + "mean_token_accuracy": 0.6442146301269531, + "num_tokens": 5741612.0, + "step": 225 + }, + { + "epoch": 0.02481880079068746, + "grad_norm": 6.02902364730835, + "learning_rate": 1.2349066959385291e-07, + "loss": 1.2333, + "mean_token_accuracy": 0.6521704196929932, + "num_tokens": 5769437.0, + "step": 226 + }, + { + "epoch": 0.024928618493301122, + "grad_norm": 4.73038911819458, + "learning_rate": 1.2403951701427003e-07, + "loss": 1.3162, + 
"mean_token_accuracy": 0.6207985281944275, + "num_tokens": 5797967.0, + "step": 227 + }, + { + "epoch": 0.02503843619591478, + "grad_norm": 6.379345417022705, + "learning_rate": 1.2458836443468714e-07, + "loss": 1.1763, + "mean_token_accuracy": 0.6624815464019775, + "num_tokens": 5821014.0, + "step": 228 + }, + { + "epoch": 0.02514825389852844, + "grad_norm": 5.088677883148193, + "learning_rate": 1.2513721185510429e-07, + "loss": 1.1751, + "mean_token_accuracy": 0.6716926097869873, + "num_tokens": 5847515.0, + "step": 229 + }, + { + "epoch": 0.025258071601142105, + "grad_norm": 4.453393936157227, + "learning_rate": 1.256860592755214e-07, + "loss": 1.2175, + "mean_token_accuracy": 0.6624367237091064, + "num_tokens": 5876169.0, + "step": 230 + }, + { + "epoch": 0.025367889303755765, + "grad_norm": 5.027719974517822, + "learning_rate": 1.2623490669593854e-07, + "loss": 1.2585, + "mean_token_accuracy": 0.6512571573257446, + "num_tokens": 5905091.0, + "step": 231 + }, + { + "epoch": 0.025477707006369428, + "grad_norm": 6.3550028800964355, + "learning_rate": 1.2678375411635563e-07, + "loss": 1.3385, + "mean_token_accuracy": 0.6424291133880615, + "num_tokens": 5926918.0, + "step": 232 + }, + { + "epoch": 0.025587524708983088, + "grad_norm": 5.946305751800537, + "learning_rate": 1.2733260153677277e-07, + "loss": 1.3077, + "mean_token_accuracy": 0.6305732727050781, + "num_tokens": 5948931.0, + "step": 233 + }, + { + "epoch": 0.025697342411596748, + "grad_norm": 5.355642795562744, + "learning_rate": 1.278814489571899e-07, + "loss": 1.2955, + "mean_token_accuracy": 0.6359736919403076, + "num_tokens": 5971928.0, + "step": 234 + }, + { + "epoch": 0.02580716011421041, + "grad_norm": 4.897372245788574, + "learning_rate": 1.2843029637760703e-07, + "loss": 1.1811, + "mean_token_accuracy": 0.6778658628463745, + "num_tokens": 6001957.0, + "step": 235 + }, + { + "epoch": 0.02591697781682407, + "grad_norm": 6.726655006408691, + "learning_rate": 1.2897914379802412e-07, + "loss": 1.2405, 
+ "mean_token_accuracy": 0.6502217650413513, + "num_tokens": 6027760.0, + "step": 236 + }, + { + "epoch": 0.026026795519437734, + "grad_norm": 5.354612827301025, + "learning_rate": 1.2952799121844126e-07, + "loss": 1.1482, + "mean_token_accuracy": 0.6771365404129028, + "num_tokens": 6053396.0, + "step": 237 + }, + { + "epoch": 0.026136613222051394, + "grad_norm": 5.776284694671631, + "learning_rate": 1.300768386388584e-07, + "loss": 1.1999, + "mean_token_accuracy": 0.6642111539840698, + "num_tokens": 6078885.0, + "step": 238 + }, + { + "epoch": 0.026246430924665057, + "grad_norm": 6.204336643218994, + "learning_rate": 1.3062568605927552e-07, + "loss": 1.2759, + "mean_token_accuracy": 0.6445035934448242, + "num_tokens": 6105632.0, + "step": 239 + }, + { + "epoch": 0.026356248627278717, + "grad_norm": 6.279324054718018, + "learning_rate": 1.3117453347969266e-07, + "loss": 1.2325, + "mean_token_accuracy": 0.653620719909668, + "num_tokens": 6127652.0, + "step": 240 + }, + { + "epoch": 0.026466066329892377, + "grad_norm": 6.521228790283203, + "learning_rate": 1.3172338090010975e-07, + "loss": 1.2238, + "mean_token_accuracy": 0.6597338914871216, + "num_tokens": 6148758.0, + "step": 241 + }, + { + "epoch": 0.02657588403250604, + "grad_norm": 6.01894474029541, + "learning_rate": 1.322722283205269e-07, + "loss": 1.324, + "mean_token_accuracy": 0.6281133890151978, + "num_tokens": 6174511.0, + "step": 242 + }, + { + "epoch": 0.0266857017351197, + "grad_norm": 4.339667797088623, + "learning_rate": 1.32821075740944e-07, + "loss": 1.2476, + "mean_token_accuracy": 0.6549266576766968, + "num_tokens": 6203700.0, + "step": 243 + }, + { + "epoch": 0.026795519437733364, + "grad_norm": 5.315229892730713, + "learning_rate": 1.3336992316136115e-07, + "loss": 1.3828, + "mean_token_accuracy": 0.6139639616012573, + "num_tokens": 6230034.0, + "step": 244 + }, + { + "epoch": 0.026905337140347024, + "grad_norm": 5.0578155517578125, + "learning_rate": 1.3391877058177826e-07, + "loss": 1.2072, + 
"mean_token_accuracy": 0.6801385283470154, + "num_tokens": 6254482.0, + "step": 245 + }, + { + "epoch": 0.027015154842960687, + "grad_norm": 5.518126964569092, + "learning_rate": 1.3446761800219538e-07, + "loss": 1.3411, + "mean_token_accuracy": 0.6340261697769165, + "num_tokens": 6279125.0, + "step": 246 + }, + { + "epoch": 0.027124972545574347, + "grad_norm": 5.819349765777588, + "learning_rate": 1.350164654226125e-07, + "loss": 1.2306, + "mean_token_accuracy": 0.6499419212341309, + "num_tokens": 6304000.0, + "step": 247 + }, + { + "epoch": 0.027234790248188007, + "grad_norm": 7.114381790161133, + "learning_rate": 1.3556531284302963e-07, + "loss": 1.1969, + "mean_token_accuracy": 0.6600766181945801, + "num_tokens": 6324786.0, + "step": 248 + }, + { + "epoch": 0.02734460795080167, + "grad_norm": 4.990294456481934, + "learning_rate": 1.3611416026344675e-07, + "loss": 1.2069, + "mean_token_accuracy": 0.6590170860290527, + "num_tokens": 6345725.0, + "step": 249 + }, + { + "epoch": 0.02745442565341533, + "grad_norm": 5.342772483825684, + "learning_rate": 1.366630076838639e-07, + "loss": 1.3009, + "mean_token_accuracy": 0.6409324407577515, + "num_tokens": 6373952.0, + "step": 250 + }, + { + "epoch": 0.027564243356028993, + "grad_norm": 4.895172595977783, + "learning_rate": 1.37211855104281e-07, + "loss": 1.2007, + "mean_token_accuracy": 0.6556930541992188, + "num_tokens": 6397657.0, + "step": 251 + }, + { + "epoch": 0.027674061058642653, + "grad_norm": 5.3326592445373535, + "learning_rate": 1.3776070252469812e-07, + "loss": 1.2164, + "mean_token_accuracy": 0.6496132612228394, + "num_tokens": 6421074.0, + "step": 252 + }, + { + "epoch": 0.027783878761256313, + "grad_norm": 5.2743988037109375, + "learning_rate": 1.3830954994511526e-07, + "loss": 1.2731, + "mean_token_accuracy": 0.639190137386322, + "num_tokens": 6442761.0, + "step": 253 + }, + { + "epoch": 0.027893696463869976, + "grad_norm": 5.532503128051758, + "learning_rate": 1.3885839736553238e-07, + "loss": 1.1169, 
+ "mean_token_accuracy": 0.6804842948913574, + "num_tokens": 6463721.0, + "step": 254 + }, + { + "epoch": 0.028003514166483636, + "grad_norm": 4.900204181671143, + "learning_rate": 1.394072447859495e-07, + "loss": 1.2136, + "mean_token_accuracy": 0.6549437046051025, + "num_tokens": 6490120.0, + "step": 255 + }, + { + "epoch": 0.0281133318690973, + "grad_norm": 5.193093776702881, + "learning_rate": 1.399560922063666e-07, + "loss": 1.1936, + "mean_token_accuracy": 0.6633338928222656, + "num_tokens": 6512775.0, + "step": 256 + }, + { + "epoch": 0.02822314957171096, + "grad_norm": 5.517599105834961, + "learning_rate": 1.4050493962678375e-07, + "loss": 1.2991, + "mean_token_accuracy": 0.6368288993835449, + "num_tokens": 6538916.0, + "step": 257 + }, + { + "epoch": 0.028332967274324623, + "grad_norm": 5.492110729217529, + "learning_rate": 1.4105378704720087e-07, + "loss": 1.2214, + "mean_token_accuracy": 0.6634722948074341, + "num_tokens": 6564170.0, + "step": 258 + }, + { + "epoch": 0.028442784976938282, + "grad_norm": 5.747128009796143, + "learning_rate": 1.41602634467618e-07, + "loss": 1.2573, + "mean_token_accuracy": 0.6634171009063721, + "num_tokens": 6586536.0, + "step": 259 + }, + { + "epoch": 0.028552602679551942, + "grad_norm": 8.23257827758789, + "learning_rate": 1.421514818880351e-07, + "loss": 1.2689, + "mean_token_accuracy": 0.6446000933647156, + "num_tokens": 6608249.0, + "step": 260 + }, + { + "epoch": 0.028662420382165606, + "grad_norm": 4.492578029632568, + "learning_rate": 1.4270032930845224e-07, + "loss": 1.209, + "mean_token_accuracy": 0.659411609172821, + "num_tokens": 6638597.0, + "step": 261 + }, + { + "epoch": 0.028772238084779266, + "grad_norm": 5.017581462860107, + "learning_rate": 1.4324917672886938e-07, + "loss": 1.2786, + "mean_token_accuracy": 0.6380189657211304, + "num_tokens": 6663725.0, + "step": 262 + }, + { + "epoch": 0.02888205578739293, + "grad_norm": 5.385952949523926, + "learning_rate": 1.437980241492865e-07, + "loss": 1.188, + 
"mean_token_accuracy": 0.6633579134941101, + "num_tokens": 6688830.0, + "step": 263 + }, + { + "epoch": 0.02899187349000659, + "grad_norm": 5.893916130065918, + "learning_rate": 1.4434687156970364e-07, + "loss": 1.2688, + "mean_token_accuracy": 0.6422685384750366, + "num_tokens": 6714050.0, + "step": 264 + }, + { + "epoch": 0.029101691192620252, + "grad_norm": 5.0674004554748535, + "learning_rate": 1.4489571899012073e-07, + "loss": 1.2775, + "mean_token_accuracy": 0.644943118095398, + "num_tokens": 6736470.0, + "step": 265 + }, + { + "epoch": 0.029211508895233912, + "grad_norm": 5.167535305023193, + "learning_rate": 1.4544456641053787e-07, + "loss": 1.184, + "mean_token_accuracy": 0.6676405668258667, + "num_tokens": 6762762.0, + "step": 266 + }, + { + "epoch": 0.029321326597847572, + "grad_norm": 7.8241119384765625, + "learning_rate": 1.4599341383095498e-07, + "loss": 1.3261, + "mean_token_accuracy": 0.6390317678451538, + "num_tokens": 6788172.0, + "step": 267 + }, + { + "epoch": 0.029431144300461235, + "grad_norm": 4.557558059692383, + "learning_rate": 1.4654226125137212e-07, + "loss": 1.2462, + "mean_token_accuracy": 0.6534675359725952, + "num_tokens": 6815292.0, + "step": 268 + }, + { + "epoch": 0.029540962003074895, + "grad_norm": 5.149682521820068, + "learning_rate": 1.4709110867178924e-07, + "loss": 1.2309, + "mean_token_accuracy": 0.6543508768081665, + "num_tokens": 6841897.0, + "step": 269 + }, + { + "epoch": 0.02965077970568856, + "grad_norm": 5.705262660980225, + "learning_rate": 1.4763995609220636e-07, + "loss": 1.266, + "mean_token_accuracy": 0.6478369235992432, + "num_tokens": 6872115.0, + "step": 270 + }, + { + "epoch": 0.029760597408302218, + "grad_norm": 5.516839981079102, + "learning_rate": 1.4818880351262347e-07, + "loss": 1.2903, + "mean_token_accuracy": 0.6429321765899658, + "num_tokens": 6898760.0, + "step": 271 + }, + { + "epoch": 0.029870415110915878, + "grad_norm": 5.042950630187988, + "learning_rate": 1.487376509330406e-07, + "loss": 
1.1734, + "mean_token_accuracy": 0.673283576965332, + "num_tokens": 6922982.0, + "step": 272 + }, + { + "epoch": 0.02998023281352954, + "grad_norm": 6.524009704589844, + "learning_rate": 1.4928649835345773e-07, + "loss": 1.3016, + "mean_token_accuracy": 0.6569399833679199, + "num_tokens": 6947600.0, + "step": 273 + }, + { + "epoch": 0.0300900505161432, + "grad_norm": 5.5010294914245605, + "learning_rate": 1.4983534577387484e-07, + "loss": 1.1883, + "mean_token_accuracy": 0.666557788848877, + "num_tokens": 6974337.0, + "step": 274 + }, + { + "epoch": 0.030199868218756865, + "grad_norm": 8.109553337097168, + "learning_rate": 1.5038419319429198e-07, + "loss": 1.2168, + "mean_token_accuracy": 0.6473851799964905, + "num_tokens": 7000048.0, + "step": 275 + }, + { + "epoch": 0.030309685921370524, + "grad_norm": 5.431443691253662, + "learning_rate": 1.509330406147091e-07, + "loss": 1.111, + "mean_token_accuracy": 0.676626443862915, + "num_tokens": 7023669.0, + "step": 276 + }, + { + "epoch": 0.030419503623984188, + "grad_norm": 5.892745018005371, + "learning_rate": 1.5148188803512624e-07, + "loss": 1.2648, + "mean_token_accuracy": 0.6456499695777893, + "num_tokens": 7048680.0, + "step": 277 + }, + { + "epoch": 0.030529321326597848, + "grad_norm": 5.007660865783691, + "learning_rate": 1.5203073545554336e-07, + "loss": 1.2499, + "mean_token_accuracy": 0.6509580612182617, + "num_tokens": 7074912.0, + "step": 278 + }, + { + "epoch": 0.030639139029211507, + "grad_norm": 6.04888391494751, + "learning_rate": 1.5257958287596047e-07, + "loss": 1.1962, + "mean_token_accuracy": 0.6621364951133728, + "num_tokens": 7098564.0, + "step": 279 + }, + { + "epoch": 0.03074895673182517, + "grad_norm": 6.511871814727783, + "learning_rate": 1.531284302963776e-07, + "loss": 1.1845, + "mean_token_accuracy": 0.6689218282699585, + "num_tokens": 7122759.0, + "step": 280 + }, + { + "epoch": 0.03085877443443883, + "grad_norm": 4.343351364135742, + "learning_rate": 1.5367727771679473e-07, + "loss": 
1.3182, + "mean_token_accuracy": 0.6306874752044678, + "num_tokens": 7153674.0, + "step": 281 + }, + { + "epoch": 0.030968592137052494, + "grad_norm": 4.625445365905762, + "learning_rate": 1.5422612513721184e-07, + "loss": 1.2633, + "mean_token_accuracy": 0.6474898457527161, + "num_tokens": 7183307.0, + "step": 282 + }, + { + "epoch": 0.031078409839666154, + "grad_norm": 5.143370628356934, + "learning_rate": 1.5477497255762899e-07, + "loss": 1.2094, + "mean_token_accuracy": 0.6577804684638977, + "num_tokens": 7212234.0, + "step": 283 + }, + { + "epoch": 0.031188227542279817, + "grad_norm": 5.020436763763428, + "learning_rate": 1.5532381997804607e-07, + "loss": 1.2476, + "mean_token_accuracy": 0.6506901383399963, + "num_tokens": 7238995.0, + "step": 284 + }, + { + "epoch": 0.03129804524489348, + "grad_norm": 7.381127834320068, + "learning_rate": 1.5587266739846322e-07, + "loss": 1.3091, + "mean_token_accuracy": 0.6467322707176208, + "num_tokens": 7262477.0, + "step": 285 + }, + { + "epoch": 0.03140786294750714, + "grad_norm": 5.481541156768799, + "learning_rate": 1.5642151481888036e-07, + "loss": 1.1348, + "mean_token_accuracy": 0.6732155084609985, + "num_tokens": 7282310.0, + "step": 286 + }, + { + "epoch": 0.0315176806501208, + "grad_norm": 5.900959491729736, + "learning_rate": 1.5697036223929747e-07, + "loss": 1.189, + "mean_token_accuracy": 0.6632980704307556, + "num_tokens": 7302768.0, + "step": 287 + }, + { + "epoch": 0.031627498352734464, + "grad_norm": 4.540388584136963, + "learning_rate": 1.5751920965971461e-07, + "loss": 1.2455, + "mean_token_accuracy": 0.6523578763008118, + "num_tokens": 7335823.0, + "step": 288 + }, + { + "epoch": 0.03173731605534812, + "grad_norm": 5.5649261474609375, + "learning_rate": 1.580680570801317e-07, + "loss": 1.1915, + "mean_token_accuracy": 0.6491826772689819, + "num_tokens": 7359238.0, + "step": 289 + }, + { + "epoch": 0.03184713375796178, + "grad_norm": 4.944965839385986, + "learning_rate": 1.5861690450054885e-07, + "loss": 
1.1857, + "mean_token_accuracy": 0.6626017093658447, + "num_tokens": 7388476.0, + "step": 290 + }, + { + "epoch": 0.03195695146057544, + "grad_norm": 6.168661594390869, + "learning_rate": 1.5916575192096596e-07, + "loss": 1.2598, + "mean_token_accuracy": 0.6503548622131348, + "num_tokens": 7413036.0, + "step": 291 + }, + { + "epoch": 0.0320667691631891, + "grad_norm": 5.349205493927002, + "learning_rate": 1.597145993413831e-07, + "loss": 1.3346, + "mean_token_accuracy": 0.6271962523460388, + "num_tokens": 7445538.0, + "step": 292 + }, + { + "epoch": 0.03217658686580277, + "grad_norm": 5.999242305755615, + "learning_rate": 1.602634467618002e-07, + "loss": 1.2383, + "mean_token_accuracy": 0.6576793789863586, + "num_tokens": 7471506.0, + "step": 293 + }, + { + "epoch": 0.03228640456841643, + "grad_norm": 5.147767066955566, + "learning_rate": 1.6081229418221733e-07, + "loss": 1.2501, + "mean_token_accuracy": 0.6470666527748108, + "num_tokens": 7497800.0, + "step": 294 + }, + { + "epoch": 0.03239622227103009, + "grad_norm": 4.932147026062012, + "learning_rate": 1.6136114160263445e-07, + "loss": 1.1688, + "mean_token_accuracy": 0.664297878742218, + "num_tokens": 7525081.0, + "step": 295 + }, + { + "epoch": 0.03250603997364375, + "grad_norm": 4.422874450683594, + "learning_rate": 1.619099890230516e-07, + "loss": 1.1653, + "mean_token_accuracy": 0.66200852394104, + "num_tokens": 7553624.0, + "step": 296 + }, + { + "epoch": 0.03261585767625741, + "grad_norm": 4.766824245452881, + "learning_rate": 1.6245883644346873e-07, + "loss": 1.2506, + "mean_token_accuracy": 0.6422881484031677, + "num_tokens": 7579361.0, + "step": 297 + }, + { + "epoch": 0.032725675378871076, + "grad_norm": 5.384964466094971, + "learning_rate": 1.6300768386388582e-07, + "loss": 1.1161, + "mean_token_accuracy": 0.6808781623840332, + "num_tokens": 7602611.0, + "step": 298 + }, + { + "epoch": 0.032835493081484736, + "grad_norm": 5.394189834594727, + "learning_rate": 1.6355653128430296e-07, + "loss": 
1.2861, + "mean_token_accuracy": 0.6454712748527527, + "num_tokens": 7624924.0, + "step": 299 + }, + { + "epoch": 0.032945310784098396, + "grad_norm": 4.205280303955078, + "learning_rate": 1.6410537870472008e-07, + "loss": 1.1813, + "mean_token_accuracy": 0.6632276773452759, + "num_tokens": 7655092.0, + "step": 300 + }, + { + "epoch": 0.033055128486712056, + "grad_norm": 4.6710638999938965, + "learning_rate": 1.6465422612513722e-07, + "loss": 1.154, + "mean_token_accuracy": 0.6666623950004578, + "num_tokens": 7677171.0, + "step": 301 + }, + { + "epoch": 0.03316494618932572, + "grad_norm": 5.9497199058532715, + "learning_rate": 1.6520307354555433e-07, + "loss": 1.1797, + "mean_token_accuracy": 0.6579101085662842, + "num_tokens": 7697087.0, + "step": 302 + }, + { + "epoch": 0.03327476389193938, + "grad_norm": 4.689944267272949, + "learning_rate": 1.6575192096597145e-07, + "loss": 1.2248, + "mean_token_accuracy": 0.6610757112503052, + "num_tokens": 7720963.0, + "step": 303 + }, + { + "epoch": 0.03338458159455304, + "grad_norm": 4.493658065795898, + "learning_rate": 1.6630076838638856e-07, + "loss": 1.2318, + "mean_token_accuracy": 0.6521607041358948, + "num_tokens": 7753965.0, + "step": 304 + }, + { + "epoch": 0.0334943992971667, + "grad_norm": 5.08533239364624, + "learning_rate": 1.668496158068057e-07, + "loss": 1.2126, + "mean_token_accuracy": 0.6518851518630981, + "num_tokens": 7780220.0, + "step": 305 + }, + { + "epoch": 0.03360421699978036, + "grad_norm": 4.020519256591797, + "learning_rate": 1.6739846322722282e-07, + "loss": 1.1548, + "mean_token_accuracy": 0.672480046749115, + "num_tokens": 7809518.0, + "step": 306 + }, + { + "epoch": 0.03371403470239403, + "grad_norm": 4.225880146026611, + "learning_rate": 1.6794731064763996e-07, + "loss": 1.2552, + "mean_token_accuracy": 0.6497206091880798, + "num_tokens": 7836592.0, + "step": 307 + }, + { + "epoch": 0.03382385240500769, + "grad_norm": 4.748560428619385, + "learning_rate": 1.6849615806805705e-07, + "loss": 
1.2411, + "mean_token_accuracy": 0.6472052335739136, + "num_tokens": 7860233.0, + "step": 308 + }, + { + "epoch": 0.03393367010762135, + "grad_norm": 5.099686622619629, + "learning_rate": 1.690450054884742e-07, + "loss": 1.1926, + "mean_token_accuracy": 0.6630065441131592, + "num_tokens": 7883954.0, + "step": 309 + }, + { + "epoch": 0.03404348781023501, + "grad_norm": 4.5210041999816895, + "learning_rate": 1.6959385290889134e-07, + "loss": 1.242, + "mean_token_accuracy": 0.6539610028266907, + "num_tokens": 7914248.0, + "step": 310 + }, + { + "epoch": 0.03415330551284867, + "grad_norm": 4.404906749725342, + "learning_rate": 1.7014270032930845e-07, + "loss": 1.2375, + "mean_token_accuracy": 0.6524800062179565, + "num_tokens": 7947191.0, + "step": 311 + }, + { + "epoch": 0.034263123215462335, + "grad_norm": 4.167058944702148, + "learning_rate": 1.7069154774972557e-07, + "loss": 1.2526, + "mean_token_accuracy": 0.6433173418045044, + "num_tokens": 7975074.0, + "step": 312 + }, + { + "epoch": 0.034372940918075995, + "grad_norm": 4.651433944702148, + "learning_rate": 1.7124039517014268e-07, + "loss": 1.1477, + "mean_token_accuracy": 0.6777888536453247, + "num_tokens": 7999182.0, + "step": 313 + }, + { + "epoch": 0.034482758620689655, + "grad_norm": 3.9352259635925293, + "learning_rate": 1.7178924259055982e-07, + "loss": 1.2442, + "mean_token_accuracy": 0.6491891145706177, + "num_tokens": 8027671.0, + "step": 314 + }, + { + "epoch": 0.034592576323303315, + "grad_norm": 4.901159286499023, + "learning_rate": 1.7233809001097694e-07, + "loss": 1.1395, + "mean_token_accuracy": 0.6770652532577515, + "num_tokens": 8051766.0, + "step": 315 + }, + { + "epoch": 0.034702394025916974, + "grad_norm": 5.230990886688232, + "learning_rate": 1.7288693743139408e-07, + "loss": 1.1202, + "mean_token_accuracy": 0.6749662160873413, + "num_tokens": 8077215.0, + "step": 316 + }, + { + "epoch": 0.03481221172853064, + "grad_norm": 5.065467357635498, + "learning_rate": 1.7343578485181117e-07, + 
"loss": 1.1586, + "mean_token_accuracy": 0.6654307842254639, + "num_tokens": 8099532.0, + "step": 317 + }, + { + "epoch": 0.0349220294311443, + "grad_norm": 4.909440994262695, + "learning_rate": 1.739846322722283e-07, + "loss": 1.1992, + "mean_token_accuracy": 0.6671472787857056, + "num_tokens": 8126724.0, + "step": 318 + }, + { + "epoch": 0.03503184713375796, + "grad_norm": 5.1046295166015625, + "learning_rate": 1.7453347969264543e-07, + "loss": 1.1137, + "mean_token_accuracy": 0.675557553768158, + "num_tokens": 8152216.0, + "step": 319 + }, + { + "epoch": 0.03514166483637162, + "grad_norm": 5.25687837600708, + "learning_rate": 1.7508232711306257e-07, + "loss": 1.1744, + "mean_token_accuracy": 0.6687126755714417, + "num_tokens": 8173155.0, + "step": 320 + }, + { + "epoch": 0.03525148253898529, + "grad_norm": 5.561641216278076, + "learning_rate": 1.756311745334797e-07, + "loss": 1.2046, + "mean_token_accuracy": 0.6645928621292114, + "num_tokens": 8191969.0, + "step": 321 + }, + { + "epoch": 0.03536130024159895, + "grad_norm": 3.6987907886505127, + "learning_rate": 1.761800219538968e-07, + "loss": 1.2051, + "mean_token_accuracy": 0.6645966172218323, + "num_tokens": 8219862.0, + "step": 322 + }, + { + "epoch": 0.03547111794421261, + "grad_norm": 5.335272789001465, + "learning_rate": 1.7672886937431394e-07, + "loss": 1.1226, + "mean_token_accuracy": 0.6671441197395325, + "num_tokens": 8237857.0, + "step": 323 + }, + { + "epoch": 0.03558093564682627, + "grad_norm": 4.964507102966309, + "learning_rate": 1.7727771679473105e-07, + "loss": 1.2785, + "mean_token_accuracy": 0.6339964866638184, + "num_tokens": 8264820.0, + "step": 324 + }, + { + "epoch": 0.03569075334943993, + "grad_norm": 5.193889617919922, + "learning_rate": 1.778265642151482e-07, + "loss": 1.2902, + "mean_token_accuracy": 0.6420919895172119, + "num_tokens": 8289433.0, + "step": 325 + }, + { + "epoch": 0.035800571052053594, + "grad_norm": 4.693655490875244, + "learning_rate": 1.7837541163556529e-07, + 
"loss": 1.1174, + "mean_token_accuracy": 0.6752066612243652, + "num_tokens": 8311377.0, + "step": 326 + }, + { + "epoch": 0.035910388754667254, + "grad_norm": 3.6030380725860596, + "learning_rate": 1.7892425905598243e-07, + "loss": 1.2656, + "mean_token_accuracy": 0.6449101567268372, + "num_tokens": 8338773.0, + "step": 327 + }, + { + "epoch": 0.036020206457280914, + "grad_norm": 4.946138381958008, + "learning_rate": 1.7947310647639954e-07, + "loss": 1.1389, + "mean_token_accuracy": 0.6792609691619873, + "num_tokens": 8365540.0, + "step": 328 + }, + { + "epoch": 0.03613002415989457, + "grad_norm": 4.068286418914795, + "learning_rate": 1.8002195389681668e-07, + "loss": 1.2614, + "mean_token_accuracy": 0.6408399343490601, + "num_tokens": 8392347.0, + "step": 329 + }, + { + "epoch": 0.03623984186250823, + "grad_norm": 4.4881463050842285, + "learning_rate": 1.805708013172338e-07, + "loss": 1.3063, + "mean_token_accuracy": 0.6427128911018372, + "num_tokens": 8419669.0, + "step": 330 + }, + { + "epoch": 0.0363496595651219, + "grad_norm": 5.461554527282715, + "learning_rate": 1.8111964873765091e-07, + "loss": 1.3076, + "mean_token_accuracy": 0.6320650577545166, + "num_tokens": 8444478.0, + "step": 331 + }, + { + "epoch": 0.03645947726773556, + "grad_norm": 5.592175483703613, + "learning_rate": 1.8166849615806803e-07, + "loss": 1.2279, + "mean_token_accuracy": 0.6482008099555969, + "num_tokens": 8469724.0, + "step": 332 + }, + { + "epoch": 0.03656929497034922, + "grad_norm": 4.938872337341309, + "learning_rate": 1.8221734357848517e-07, + "loss": 1.1459, + "mean_token_accuracy": 0.6737239360809326, + "num_tokens": 8493160.0, + "step": 333 + }, + { + "epoch": 0.03667911267296288, + "grad_norm": 3.64882493019104, + "learning_rate": 1.827661909989023e-07, + "loss": 1.2097, + "mean_token_accuracy": 0.6604300737380981, + "num_tokens": 8524817.0, + "step": 334 + }, + { + "epoch": 0.03678893037557654, + "grad_norm": 5.182716369628906, + "learning_rate": 1.8331503841931943e-07, + 
"loss": 1.1285, + "mean_token_accuracy": 0.6791932582855225, + "num_tokens": 8545415.0, + "step": 335 + }, + { + "epoch": 0.036898748078190206, + "grad_norm": 4.545915603637695, + "learning_rate": 1.8386388583973654e-07, + "loss": 1.1705, + "mean_token_accuracy": 0.6636478900909424, + "num_tokens": 8564161.0, + "step": 336 + }, + { + "epoch": 0.037008565780803866, + "grad_norm": 4.262722492218018, + "learning_rate": 1.8441273326015366e-07, + "loss": 1.158, + "mean_token_accuracy": 0.6713969707489014, + "num_tokens": 8587689.0, + "step": 337 + }, + { + "epoch": 0.037118383483417526, + "grad_norm": 4.4470648765563965, + "learning_rate": 1.849615806805708e-07, + "loss": 1.1658, + "mean_token_accuracy": 0.6602307558059692, + "num_tokens": 8612920.0, + "step": 338 + }, + { + "epoch": 0.037228201186031186, + "grad_norm": 5.732725143432617, + "learning_rate": 1.8551042810098792e-07, + "loss": 1.1493, + "mean_token_accuracy": 0.671137809753418, + "num_tokens": 8637436.0, + "step": 339 + }, + { + "epoch": 0.03733801888864485, + "grad_norm": 4.437293529510498, + "learning_rate": 1.8605927552140506e-07, + "loss": 1.1261, + "mean_token_accuracy": 0.6793785095214844, + "num_tokens": 8657664.0, + "step": 340 + }, + { + "epoch": 0.03744783659125851, + "grad_norm": 4.550472736358643, + "learning_rate": 1.8660812294182215e-07, + "loss": 1.2408, + "mean_token_accuracy": 0.6546789407730103, + "num_tokens": 8682871.0, + "step": 341 + }, + { + "epoch": 0.03755765429387217, + "grad_norm": 4.254106044769287, + "learning_rate": 1.871569703622393e-07, + "loss": 1.2568, + "mean_token_accuracy": 0.6406398415565491, + "num_tokens": 8713274.0, + "step": 342 + }, + { + "epoch": 0.03766747199648583, + "grad_norm": 4.4705352783203125, + "learning_rate": 1.877058177826564e-07, + "loss": 1.1704, + "mean_token_accuracy": 0.6677408218383789, + "num_tokens": 8737111.0, + "step": 343 + }, + { + "epoch": 0.03777728969909949, + "grad_norm": 4.905544757843018, + "learning_rate": 1.8825466520307354e-07, + 
"loss": 1.2439, + "mean_token_accuracy": 0.6477274894714355, + "num_tokens": 8761421.0, + "step": 344 + }, + { + "epoch": 0.03788710740171316, + "grad_norm": 5.380320072174072, + "learning_rate": 1.8880351262349069e-07, + "loss": 1.1149, + "mean_token_accuracy": 0.6756962537765503, + "num_tokens": 8780347.0, + "step": 345 + }, + { + "epoch": 0.03799692510432682, + "grad_norm": 3.799220085144043, + "learning_rate": 1.8935236004390778e-07, + "loss": 1.2644, + "mean_token_accuracy": 0.6427857875823975, + "num_tokens": 8809479.0, + "step": 346 + }, + { + "epoch": 0.03810674280694048, + "grad_norm": 4.397911071777344, + "learning_rate": 1.8990120746432492e-07, + "loss": 1.267, + "mean_token_accuracy": 0.6381064057350159, + "num_tokens": 8838836.0, + "step": 347 + }, + { + "epoch": 0.03821656050955414, + "grad_norm": 4.7146687507629395, + "learning_rate": 1.9045005488474203e-07, + "loss": 1.2324, + "mean_token_accuracy": 0.6541329622268677, + "num_tokens": 8870887.0, + "step": 348 + }, + { + "epoch": 0.0383263782121678, + "grad_norm": 4.299011707305908, + "learning_rate": 1.9099890230515917e-07, + "loss": 1.1716, + "mean_token_accuracy": 0.6594277620315552, + "num_tokens": 8893506.0, + "step": 349 + }, + { + "epoch": 0.038436195914781465, + "grad_norm": 4.526052951812744, + "learning_rate": 1.9154774972557626e-07, + "loss": 1.0878, + "mean_token_accuracy": 0.678132176399231, + "num_tokens": 8912583.0, + "step": 350 + }, + { + "epoch": 0.038546013617395125, + "grad_norm": 4.676695823669434, + "learning_rate": 1.920965971459934e-07, + "loss": 1.2143, + "mean_token_accuracy": 0.6687222123146057, + "num_tokens": 8937271.0, + "step": 351 + }, + { + "epoch": 0.038655831320008785, + "grad_norm": 4.234563827514648, + "learning_rate": 1.9264544456641052e-07, + "loss": 1.1152, + "mean_token_accuracy": 0.6756932735443115, + "num_tokens": 8960517.0, + "step": 352 + }, + { + "epoch": 0.038765649022622445, + "grad_norm": 4.085710048675537, + "learning_rate": 1.9319429198682766e-07, + 
"loss": 1.2971, + "mean_token_accuracy": 0.6375161409378052, + "num_tokens": 8989909.0, + "step": 353 + }, + { + "epoch": 0.038875466725236105, + "grad_norm": 4.791993618011475, + "learning_rate": 1.9374313940724478e-07, + "loss": 1.1741, + "mean_token_accuracy": 0.6640819907188416, + "num_tokens": 9012273.0, + "step": 354 + }, + { + "epoch": 0.03898528442784977, + "grad_norm": 4.419633388519287, + "learning_rate": 1.942919868276619e-07, + "loss": 1.2362, + "mean_token_accuracy": 0.6489558219909668, + "num_tokens": 9034421.0, + "step": 355 + }, + { + "epoch": 0.03909510213046343, + "grad_norm": 4.961784839630127, + "learning_rate": 1.9484083424807903e-07, + "loss": 1.1661, + "mean_token_accuracy": 0.6678944230079651, + "num_tokens": 9061456.0, + "step": 356 + }, + { + "epoch": 0.03920491983307709, + "grad_norm": 4.558017253875732, + "learning_rate": 1.9538968166849615e-07, + "loss": 1.2624, + "mean_token_accuracy": 0.652471125125885, + "num_tokens": 9087486.0, + "step": 357 + }, + { + "epoch": 0.03931473753569075, + "grad_norm": 4.28415584564209, + "learning_rate": 1.959385290889133e-07, + "loss": 1.1647, + "mean_token_accuracy": 0.6684678792953491, + "num_tokens": 9111634.0, + "step": 358 + }, + { + "epoch": 0.03942455523830442, + "grad_norm": 4.642298221588135, + "learning_rate": 1.964873765093304e-07, + "loss": 1.1405, + "mean_token_accuracy": 0.6687743663787842, + "num_tokens": 9135109.0, + "step": 359 + }, + { + "epoch": 0.03953437294091808, + "grad_norm": 4.1859612464904785, + "learning_rate": 1.9703622392974752e-07, + "loss": 1.1741, + "mean_token_accuracy": 0.6737439632415771, + "num_tokens": 9157947.0, + "step": 360 + }, + { + "epoch": 0.03964419064353174, + "grad_norm": 4.008144855499268, + "learning_rate": 1.9758507135016464e-07, + "loss": 1.1886, + "mean_token_accuracy": 0.6559889316558838, + "num_tokens": 9182774.0, + "step": 361 + }, + { + "epoch": 0.0397540083461454, + "grad_norm": 3.8857927322387695, + "learning_rate": 1.9813391877058178e-07, + 
"loss": 1.2022, + "mean_token_accuracy": 0.6535874605178833, + "num_tokens": 9211017.0, + "step": 362 + }, + { + "epoch": 0.03986382604875906, + "grad_norm": 3.710855484008789, + "learning_rate": 1.986827661909989e-07, + "loss": 1.2105, + "mean_token_accuracy": 0.6549350023269653, + "num_tokens": 9241102.0, + "step": 363 + }, + { + "epoch": 0.039973643751372724, + "grad_norm": 4.016488075256348, + "learning_rate": 1.9923161361141603e-07, + "loss": 1.2214, + "mean_token_accuracy": 0.6652200222015381, + "num_tokens": 9266264.0, + "step": 364 + }, + { + "epoch": 0.040083461453986384, + "grad_norm": 3.210458517074585, + "learning_rate": 1.9978046103183312e-07, + "loss": 1.2611, + "mean_token_accuracy": 0.6430490612983704, + "num_tokens": 9296024.0, + "step": 365 + }, + { + "epoch": 0.040193279156600044, + "grad_norm": 5.118428707122803, + "learning_rate": 2.0032930845225027e-07, + "loss": 1.2257, + "mean_token_accuracy": 0.6456429958343506, + "num_tokens": 9316931.0, + "step": 366 + }, + { + "epoch": 0.040303096859213704, + "grad_norm": 3.643491744995117, + "learning_rate": 2.0087815587266738e-07, + "loss": 1.1721, + "mean_token_accuracy": 0.6770370602607727, + "num_tokens": 9346321.0, + "step": 367 + }, + { + "epoch": 0.040412914561827364, + "grad_norm": 4.3423991203308105, + "learning_rate": 2.0142700329308452e-07, + "loss": 1.171, + "mean_token_accuracy": 0.6614339351654053, + "num_tokens": 9372303.0, + "step": 368 + }, + { + "epoch": 0.04052273226444103, + "grad_norm": 3.7878849506378174, + "learning_rate": 2.0197585071350164e-07, + "loss": 1.1931, + "mean_token_accuracy": 0.6824682950973511, + "num_tokens": 9396717.0, + "step": 369 + }, + { + "epoch": 0.04063254996705469, + "grad_norm": 3.821153163909912, + "learning_rate": 2.0252469813391875e-07, + "loss": 1.0552, + "mean_token_accuracy": 0.6980278491973877, + "num_tokens": 9421262.0, + "step": 370 + }, + { + "epoch": 0.04074236766966835, + "grad_norm": 4.535167694091797, + "learning_rate": 2.030735455543359e-07, 
+ "loss": 1.1233, + "mean_token_accuracy": 0.6923707723617554, + "num_tokens": 9445052.0, + "step": 371 + }, + { + "epoch": 0.04085218537228201, + "grad_norm": 3.020939826965332, + "learning_rate": 2.03622392974753e-07, + "loss": 1.1522, + "mean_token_accuracy": 0.6666748523712158, + "num_tokens": 9476906.0, + "step": 372 + }, + { + "epoch": 0.04096200307489567, + "grad_norm": 4.148868083953857, + "learning_rate": 2.0417124039517015e-07, + "loss": 1.1863, + "mean_token_accuracy": 0.6689727902412415, + "num_tokens": 9500350.0, + "step": 373 + }, + { + "epoch": 0.04107182077750934, + "grad_norm": 3.0401268005371094, + "learning_rate": 2.0472008781558724e-07, + "loss": 1.1809, + "mean_token_accuracy": 0.663672685623169, + "num_tokens": 9535738.0, + "step": 374 + }, + { + "epoch": 0.041181638480122996, + "grad_norm": 3.5066888332366943, + "learning_rate": 2.0526893523600438e-07, + "loss": 1.246, + "mean_token_accuracy": 0.649204432964325, + "num_tokens": 9565682.0, + "step": 375 + }, + { + "epoch": 0.041291456182736656, + "grad_norm": 3.5591325759887695, + "learning_rate": 2.058177826564215e-07, + "loss": 1.21, + "mean_token_accuracy": 0.6589028835296631, + "num_tokens": 9595414.0, + "step": 376 + }, + { + "epoch": 0.041401273885350316, + "grad_norm": 5.832372188568115, + "learning_rate": 2.0636663007683864e-07, + "loss": 1.2157, + "mean_token_accuracy": 0.6526602506637573, + "num_tokens": 9614615.0, + "step": 377 + }, + { + "epoch": 0.04151109158796398, + "grad_norm": 4.369155406951904, + "learning_rate": 2.0691547749725575e-07, + "loss": 1.2314, + "mean_token_accuracy": 0.6472725868225098, + "num_tokens": 9640106.0, + "step": 378 + }, + { + "epoch": 0.04162090929057764, + "grad_norm": 3.4279398918151855, + "learning_rate": 2.0746432491767287e-07, + "loss": 1.2432, + "mean_token_accuracy": 0.6377360224723816, + "num_tokens": 9665972.0, + "step": 379 + }, + { + "epoch": 0.0417307269931913, + "grad_norm": 3.4615700244903564, + "learning_rate": 2.0801317233809e-07, + 
"loss": 1.2282, + "mean_token_accuracy": 0.6523503065109253, + "num_tokens": 9693186.0, + "step": 380 + }, + { + "epoch": 0.04184054469580496, + "grad_norm": 3.8563334941864014, + "learning_rate": 2.0856201975850713e-07, + "loss": 1.1857, + "mean_token_accuracy": 0.6684091687202454, + "num_tokens": 9725171.0, + "step": 381 + }, + { + "epoch": 0.04195036239841862, + "grad_norm": 3.768754005432129, + "learning_rate": 2.0911086717892427e-07, + "loss": 1.1492, + "mean_token_accuracy": 0.6642647981643677, + "num_tokens": 9750492.0, + "step": 382 + }, + { + "epoch": 0.04206018010103229, + "grad_norm": 3.3687946796417236, + "learning_rate": 2.0965971459934136e-07, + "loss": 1.2051, + "mean_token_accuracy": 0.6641743183135986, + "num_tokens": 9779136.0, + "step": 383 + }, + { + "epoch": 0.04216999780364595, + "grad_norm": 3.5496182441711426, + "learning_rate": 2.102085620197585e-07, + "loss": 1.1562, + "mean_token_accuracy": 0.6673194766044617, + "num_tokens": 9805387.0, + "step": 384 + }, + { + "epoch": 0.04227981550625961, + "grad_norm": 4.212862491607666, + "learning_rate": 2.1075740944017561e-07, + "loss": 1.1634, + "mean_token_accuracy": 0.6643638610839844, + "num_tokens": 9830283.0, + "step": 385 + }, + { + "epoch": 0.04238963320887327, + "grad_norm": 4.210342884063721, + "learning_rate": 2.1130625686059276e-07, + "loss": 1.1926, + "mean_token_accuracy": 0.6559033393859863, + "num_tokens": 9855143.0, + "step": 386 + }, + { + "epoch": 0.04249945091148693, + "grad_norm": 3.436352491378784, + "learning_rate": 2.1185510428100987e-07, + "loss": 1.1951, + "mean_token_accuracy": 0.6605526208877563, + "num_tokens": 9882320.0, + "step": 387 + }, + { + "epoch": 0.042609268614100596, + "grad_norm": 3.9491188526153564, + "learning_rate": 2.1240395170142699e-07, + "loss": 1.1688, + "mean_token_accuracy": 0.6644600629806519, + "num_tokens": 9908664.0, + "step": 388 + }, + { + "epoch": 0.042719086316714255, + "grad_norm": 3.6913793087005615, + "learning_rate": 
2.129527991218441e-07, + "loss": 1.1359, + "mean_token_accuracy": 0.6755316853523254, + "num_tokens": 9930833.0, + "step": 389 + }, + { + "epoch": 0.042828904019327915, + "grad_norm": 4.55180549621582, + "learning_rate": 2.1350164654226124e-07, + "loss": 1.1497, + "mean_token_accuracy": 0.6639282703399658, + "num_tokens": 9949898.0, + "step": 390 + }, + { + "epoch": 0.042938721721941575, + "grad_norm": 3.6078104972839355, + "learning_rate": 2.1405049396267836e-07, + "loss": 1.1484, + "mean_token_accuracy": 0.6685201525688171, + "num_tokens": 9977229.0, + "step": 391 + }, + { + "epoch": 0.043048539424555235, + "grad_norm": 3.668510913848877, + "learning_rate": 2.145993413830955e-07, + "loss": 1.1199, + "mean_token_accuracy": 0.6750396490097046, + "num_tokens": 10002741.0, + "step": 392 + }, + { + "epoch": 0.0431583571271689, + "grad_norm": 4.034297943115234, + "learning_rate": 2.1514818880351262e-07, + "loss": 1.119, + "mean_token_accuracy": 0.6794573068618774, + "num_tokens": 10025016.0, + "step": 393 + }, + { + "epoch": 0.04326817482978256, + "grad_norm": 3.5652334690093994, + "learning_rate": 2.1569703622392973e-07, + "loss": 1.2348, + "mean_token_accuracy": 0.6460207104682922, + "num_tokens": 10050029.0, + "step": 394 + }, + { + "epoch": 0.04337799253239622, + "grad_norm": 3.882769823074341, + "learning_rate": 2.1624588364434687e-07, + "loss": 1.1935, + "mean_token_accuracy": 0.6753181219100952, + "num_tokens": 10078664.0, + "step": 395 + }, + { + "epoch": 0.04348781023500988, + "grad_norm": 3.234818458557129, + "learning_rate": 2.16794731064764e-07, + "loss": 1.2795, + "mean_token_accuracy": 0.64984530210495, + "num_tokens": 10104803.0, + "step": 396 + }, + { + "epoch": 0.04359762793762355, + "grad_norm": 4.6359333992004395, + "learning_rate": 2.1734357848518113e-07, + "loss": 1.1063, + "mean_token_accuracy": 0.6830635070800781, + "num_tokens": 10122877.0, + "step": 397 + }, + { + "epoch": 0.04370744564023721, + "grad_norm": 3.6053645610809326, + 
"learning_rate": 2.1789242590559822e-07, + "loss": 1.2521, + "mean_token_accuracy": 0.6419405341148376, + "num_tokens": 10150859.0, + "step": 398 + }, + { + "epoch": 0.04381726334285087, + "grad_norm": 2.935767889022827, + "learning_rate": 2.1844127332601536e-07, + "loss": 1.2183, + "mean_token_accuracy": 0.6496689915657043, + "num_tokens": 10183676.0, + "step": 399 + }, + { + "epoch": 0.04392708104546453, + "grad_norm": 3.151416778564453, + "learning_rate": 2.1899012074643247e-07, + "loss": 1.1314, + "mean_token_accuracy": 0.6728666424751282, + "num_tokens": 10213062.0, + "step": 400 + }, + { + "epoch": 0.04403689874807819, + "grad_norm": 3.7536239624023438, + "learning_rate": 2.1953896816684962e-07, + "loss": 1.0759, + "mean_token_accuracy": 0.6885571479797363, + "num_tokens": 10236049.0, + "step": 401 + }, + { + "epoch": 0.044146716450691854, + "grad_norm": 3.2735540866851807, + "learning_rate": 2.200878155872667e-07, + "loss": 1.1971, + "mean_token_accuracy": 0.6602115631103516, + "num_tokens": 10267449.0, + "step": 402 + }, + { + "epoch": 0.044256534153305514, + "grad_norm": 3.413952350616455, + "learning_rate": 2.2063666300768385e-07, + "loss": 1.2351, + "mean_token_accuracy": 0.6453385353088379, + "num_tokens": 10293114.0, + "step": 403 + }, + { + "epoch": 0.044366351855919174, + "grad_norm": 3.777785539627075, + "learning_rate": 2.21185510428101e-07, + "loss": 1.3212, + "mean_token_accuracy": 0.6366878747940063, + "num_tokens": 10321878.0, + "step": 404 + }, + { + "epoch": 0.044476169558532834, + "grad_norm": 3.0434064865112305, + "learning_rate": 2.217343578485181e-07, + "loss": 1.2222, + "mean_token_accuracy": 0.6531875729560852, + "num_tokens": 10351888.0, + "step": 405 + }, + { + "epoch": 0.044585987261146494, + "grad_norm": 3.5009679794311523, + "learning_rate": 2.2228320526893525e-07, + "loss": 1.2451, + "mean_token_accuracy": 0.6460450291633606, + "num_tokens": 10378693.0, + "step": 406 + }, + { + "epoch": 0.04469580496376016, + "grad_norm": 
3.0665030479431152, + "learning_rate": 2.2283205268935233e-07, + "loss": 1.172, + "mean_token_accuracy": 0.6719194650650024, + "num_tokens": 10406696.0, + "step": 407 + }, + { + "epoch": 0.04480562266637382, + "grad_norm": 3.3878118991851807, + "learning_rate": 2.2338090010976948e-07, + "loss": 1.0926, + "mean_token_accuracy": 0.6853681206703186, + "num_tokens": 10430601.0, + "step": 408 + }, + { + "epoch": 0.04491544036898748, + "grad_norm": 3.240556240081787, + "learning_rate": 2.239297475301866e-07, + "loss": 1.1278, + "mean_token_accuracy": 0.6730022430419922, + "num_tokens": 10455772.0, + "step": 409 + }, + { + "epoch": 0.04502525807160114, + "grad_norm": 3.2028093338012695, + "learning_rate": 2.2447859495060373e-07, + "loss": 1.1907, + "mean_token_accuracy": 0.6674685478210449, + "num_tokens": 10480785.0, + "step": 410 + }, + { + "epoch": 0.0451350757742148, + "grad_norm": 3.453604221343994, + "learning_rate": 2.2502744237102085e-07, + "loss": 1.1861, + "mean_token_accuracy": 0.6663822531700134, + "num_tokens": 10508918.0, + "step": 411 + }, + { + "epoch": 0.04524489347682847, + "grad_norm": 3.780116081237793, + "learning_rate": 2.2557628979143796e-07, + "loss": 1.1603, + "mean_token_accuracy": 0.6769781708717346, + "num_tokens": 10531219.0, + "step": 412 + }, + { + "epoch": 0.04535471117944213, + "grad_norm": 3.38691782951355, + "learning_rate": 2.2612513721185508e-07, + "loss": 1.1559, + "mean_token_accuracy": 0.6623808145523071, + "num_tokens": 10556191.0, + "step": 413 + }, + { + "epoch": 0.04546452888205579, + "grad_norm": 4.388897895812988, + "learning_rate": 2.2667398463227222e-07, + "loss": 1.1364, + "mean_token_accuracy": 0.6755678057670593, + "num_tokens": 10574403.0, + "step": 414 + }, + { + "epoch": 0.045574346584669446, + "grad_norm": 4.133070468902588, + "learning_rate": 2.2722283205268934e-07, + "loss": 1.1882, + "mean_token_accuracy": 0.6563827395439148, + "num_tokens": 10600561.0, + "step": 415 + }, + { + "epoch": 0.04568416428728311, + 
"grad_norm": 3.1719977855682373, + "learning_rate": 2.2777167947310648e-07, + "loss": 1.1943, + "mean_token_accuracy": 0.6604108214378357, + "num_tokens": 10630815.0, + "step": 416 + }, + { + "epoch": 0.04579398198989677, + "grad_norm": 4.5058183670043945, + "learning_rate": 2.283205268935236e-07, + "loss": 1.1786, + "mean_token_accuracy": 0.6608145236968994, + "num_tokens": 10651902.0, + "step": 417 + }, + { + "epoch": 0.04590379969251043, + "grad_norm": 3.084299087524414, + "learning_rate": 2.288693743139407e-07, + "loss": 1.1713, + "mean_token_accuracy": 0.6734328866004944, + "num_tokens": 10682418.0, + "step": 418 + }, + { + "epoch": 0.04601361739512409, + "grad_norm": 3.405825614929199, + "learning_rate": 2.2941822173435785e-07, + "loss": 1.1564, + "mean_token_accuracy": 0.6674414277076721, + "num_tokens": 10710194.0, + "step": 419 + }, + { + "epoch": 0.04612343509773775, + "grad_norm": 3.4039387702941895, + "learning_rate": 2.2996706915477496e-07, + "loss": 1.1445, + "mean_token_accuracy": 0.6706306338310242, + "num_tokens": 10732623.0, + "step": 420 + }, + { + "epoch": 0.04623325280035142, + "grad_norm": 3.511167049407959, + "learning_rate": 2.3051591657519208e-07, + "loss": 1.1023, + "mean_token_accuracy": 0.675940990447998, + "num_tokens": 10757154.0, + "step": 421 + }, + { + "epoch": 0.04634307050296508, + "grad_norm": 3.891186237335205, + "learning_rate": 2.310647639956092e-07, + "loss": 1.2888, + "mean_token_accuracy": 0.633573055267334, + "num_tokens": 10781227.0, + "step": 422 + }, + { + "epoch": 0.04645288820557874, + "grad_norm": 3.663708209991455, + "learning_rate": 2.3161361141602634e-07, + "loss": 1.1986, + "mean_token_accuracy": 0.6633460521697998, + "num_tokens": 10804863.0, + "step": 423 + }, + { + "epoch": 0.0465627059081924, + "grad_norm": 4.151607036590576, + "learning_rate": 2.3216245883644345e-07, + "loss": 1.1152, + "mean_token_accuracy": 0.6766278743743896, + "num_tokens": 10825004.0, + "step": 424 + }, + { + "epoch": 
0.04667252361080606, + "grad_norm": 3.843153715133667, + "learning_rate": 2.327113062568606e-07, + "loss": 1.1878, + "mean_token_accuracy": 0.657757580280304, + "num_tokens": 10846903.0, + "step": 425 + }, + { + "epoch": 0.046782341313419726, + "grad_norm": 3.613684892654419, + "learning_rate": 2.3326015367727768e-07, + "loss": 1.0904, + "mean_token_accuracy": 0.6852306127548218, + "num_tokens": 10871464.0, + "step": 426 + }, + { + "epoch": 0.046892159016033386, + "grad_norm": 3.7147536277770996, + "learning_rate": 2.3380900109769482e-07, + "loss": 1.2356, + "mean_token_accuracy": 0.662010133266449, + "num_tokens": 10896495.0, + "step": 427 + }, + { + "epoch": 0.047001976718647046, + "grad_norm": 3.684995174407959, + "learning_rate": 2.3435784851811197e-07, + "loss": 1.1275, + "mean_token_accuracy": 0.6693975925445557, + "num_tokens": 10923120.0, + "step": 428 + }, + { + "epoch": 0.047111794421260705, + "grad_norm": 3.1970791816711426, + "learning_rate": 2.3490669593852908e-07, + "loss": 1.1428, + "mean_token_accuracy": 0.6690468788146973, + "num_tokens": 10951330.0, + "step": 429 + }, + { + "epoch": 0.047221612123874365, + "grad_norm": 3.0749592781066895, + "learning_rate": 2.3545554335894622e-07, + "loss": 1.2602, + "mean_token_accuracy": 0.6386484503746033, + "num_tokens": 10983066.0, + "step": 430 + }, + { + "epoch": 0.04733142982648803, + "grad_norm": 3.4227683544158936, + "learning_rate": 2.360043907793633e-07, + "loss": 1.1354, + "mean_token_accuracy": 0.6750014424324036, + "num_tokens": 11007354.0, + "step": 431 + }, + { + "epoch": 0.04744124752910169, + "grad_norm": 2.8202121257781982, + "learning_rate": 2.3655323819978045e-07, + "loss": 1.1968, + "mean_token_accuracy": 0.6522764563560486, + "num_tokens": 11035177.0, + "step": 432 + }, + { + "epoch": 0.04755106523171535, + "grad_norm": 3.103504180908203, + "learning_rate": 2.3710208562019757e-07, + "loss": 1.1374, + "mean_token_accuracy": 0.675384521484375, + "num_tokens": 11060576.0, + "step": 433 + }, + 
{ + "epoch": 0.04766088293432901, + "grad_norm": 2.708136558532715, + "learning_rate": 2.376509330406147e-07, + "loss": 1.2218, + "mean_token_accuracy": 0.648571252822876, + "num_tokens": 11091889.0, + "step": 434 + }, + { + "epoch": 0.04777070063694268, + "grad_norm": 3.5192947387695312, + "learning_rate": 2.3819978046103183e-07, + "loss": 1.1594, + "mean_token_accuracy": 0.6644055247306824, + "num_tokens": 11112251.0, + "step": 435 + }, + { + "epoch": 0.04788051833955634, + "grad_norm": 3.635742425918579, + "learning_rate": 2.3874862788144894e-07, + "loss": 1.1188, + "mean_token_accuracy": 0.6772193312644958, + "num_tokens": 11132562.0, + "step": 436 + }, + { + "epoch": 0.04799033604217, + "grad_norm": 2.9484477043151855, + "learning_rate": 2.3929747530186606e-07, + "loss": 1.0948, + "mean_token_accuracy": 0.683571994304657, + "num_tokens": 11161247.0, + "step": 437 + }, + { + "epoch": 0.04810015374478366, + "grad_norm": 3.95388126373291, + "learning_rate": 2.3984632272228317e-07, + "loss": 1.2085, + "mean_token_accuracy": 0.6526623964309692, + "num_tokens": 11181677.0, + "step": 438 + }, + { + "epoch": 0.04820997144739732, + "grad_norm": 2.9874746799468994, + "learning_rate": 2.4039517014270034e-07, + "loss": 1.168, + "mean_token_accuracy": 0.6574394702911377, + "num_tokens": 11210446.0, + "step": 439 + }, + { + "epoch": 0.048319789150010985, + "grad_norm": 4.021981716156006, + "learning_rate": 2.4094401756311745e-07, + "loss": 1.2056, + "mean_token_accuracy": 0.6532331109046936, + "num_tokens": 11229930.0, + "step": 440 + }, + { + "epoch": 0.048429606852624645, + "grad_norm": 3.064275026321411, + "learning_rate": 2.4149286498353457e-07, + "loss": 1.182, + "mean_token_accuracy": 0.6565237045288086, + "num_tokens": 11259473.0, + "step": 441 + }, + { + "epoch": 0.048539424555238304, + "grad_norm": 3.449357271194458, + "learning_rate": 2.420417124039517e-07, + "loss": 1.1204, + "mean_token_accuracy": 0.6738406419754028, + "num_tokens": 11281256.0, + "step": 442 + 
}, + { + "epoch": 0.048649242257851964, + "grad_norm": 3.4341084957122803, + "learning_rate": 2.425905598243688e-07, + "loss": 1.2259, + "mean_token_accuracy": 0.6489056944847107, + "num_tokens": 11311863.0, + "step": 443 + }, + { + "epoch": 0.048759059960465624, + "grad_norm": 2.87770676612854, + "learning_rate": 2.431394072447859e-07, + "loss": 1.1441, + "mean_token_accuracy": 0.6744784116744995, + "num_tokens": 11339689.0, + "step": 444 + }, + { + "epoch": 0.04886887766307929, + "grad_norm": 3.3231570720672607, + "learning_rate": 2.436882546652031e-07, + "loss": 1.1482, + "mean_token_accuracy": 0.6720186471939087, + "num_tokens": 11363126.0, + "step": 445 + }, + { + "epoch": 0.04897869536569295, + "grad_norm": 3.164910316467285, + "learning_rate": 2.442371020856202e-07, + "loss": 1.0678, + "mean_token_accuracy": 0.6877599954605103, + "num_tokens": 11386199.0, + "step": 446 + }, + { + "epoch": 0.04908851306830661, + "grad_norm": 3.9275858402252197, + "learning_rate": 2.447859495060373e-07, + "loss": 1.1678, + "mean_token_accuracy": 0.6606632471084595, + "num_tokens": 11405947.0, + "step": 447 + }, + { + "epoch": 0.04919833077092027, + "grad_norm": 3.5663182735443115, + "learning_rate": 2.4533479692645443e-07, + "loss": 1.1599, + "mean_token_accuracy": 0.6598899364471436, + "num_tokens": 11427573.0, + "step": 448 + }, + { + "epoch": 0.04930814847353393, + "grad_norm": 3.1259799003601074, + "learning_rate": 2.4588364434687154e-07, + "loss": 1.1436, + "mean_token_accuracy": 0.6703786849975586, + "num_tokens": 11452316.0, + "step": 449 + }, + { + "epoch": 0.0494179661761476, + "grad_norm": 3.258435010910034, + "learning_rate": 2.4643249176728866e-07, + "loss": 1.1263, + "mean_token_accuracy": 0.6795697212219238, + "num_tokens": 11479364.0, + "step": 450 + }, + { + "epoch": 0.04952778387876126, + "grad_norm": 3.568202018737793, + "learning_rate": 2.4698133918770583e-07, + "loss": 1.2086, + "mean_token_accuracy": 0.6542370319366455, + "num_tokens": 11501312.0, + 
"step": 451 + }, + { + "epoch": 0.04963760158137492, + "grad_norm": 3.5061707496643066, + "learning_rate": 2.4753018660812294e-07, + "loss": 1.1333, + "mean_token_accuracy": 0.670478880405426, + "num_tokens": 11524934.0, + "step": 452 + }, + { + "epoch": 0.04974741928398858, + "grad_norm": 3.099993944168091, + "learning_rate": 2.4807903402854006e-07, + "loss": 1.18, + "mean_token_accuracy": 0.6570611000061035, + "num_tokens": 11550065.0, + "step": 453 + }, + { + "epoch": 0.049857236986602244, + "grad_norm": 3.7785274982452393, + "learning_rate": 2.486278814489572e-07, + "loss": 1.1286, + "mean_token_accuracy": 0.6830159425735474, + "num_tokens": 11569006.0, + "step": 454 + }, + { + "epoch": 0.0499670546892159, + "grad_norm": 3.5797572135925293, + "learning_rate": 2.491767288693743e-07, + "loss": 1.0843, + "mean_token_accuracy": 0.6799170970916748, + "num_tokens": 11589840.0, + "step": 455 + }, + { + "epoch": 0.05007687239182956, + "grad_norm": 3.275390625, + "learning_rate": 2.4972557628979146e-07, + "loss": 1.2505, + "mean_token_accuracy": 0.6413843631744385, + "num_tokens": 11619845.0, + "step": 456 + }, + { + "epoch": 0.05018669009444322, + "grad_norm": 2.9447102546691895, + "learning_rate": 2.5027442371020857e-07, + "loss": 1.1425, + "mean_token_accuracy": 0.6873313784599304, + "num_tokens": 11647133.0, + "step": 457 + }, + { + "epoch": 0.05029650779705688, + "grad_norm": 3.1494295597076416, + "learning_rate": 2.508232711306257e-07, + "loss": 1.1854, + "mean_token_accuracy": 0.6601170897483826, + "num_tokens": 11672280.0, + "step": 458 + }, + { + "epoch": 0.05040632549967055, + "grad_norm": 2.8879780769348145, + "learning_rate": 2.513721185510428e-07, + "loss": 1.1535, + "mean_token_accuracy": 0.6772384643554688, + "num_tokens": 11700110.0, + "step": 459 + }, + { + "epoch": 0.05051614320228421, + "grad_norm": 2.6110713481903076, + "learning_rate": 2.519209659714599e-07, + "loss": 1.2153, + "mean_token_accuracy": 0.6516492962837219, + "num_tokens": 11734749.0, + 
"step": 460 + }, + { + "epoch": 0.05062596090489787, + "grad_norm": 2.872546672821045, + "learning_rate": 2.524698133918771e-07, + "loss": 1.098, + "mean_token_accuracy": 0.6789885759353638, + "num_tokens": 11764187.0, + "step": 461 + }, + { + "epoch": 0.05073577860751153, + "grad_norm": 3.281914710998535, + "learning_rate": 2.5301866081229415e-07, + "loss": 1.1829, + "mean_token_accuracy": 0.6598992943763733, + "num_tokens": 11790829.0, + "step": 462 + }, + { + "epoch": 0.05084559631012519, + "grad_norm": 2.737445592880249, + "learning_rate": 2.5356750823271126e-07, + "loss": 1.1316, + "mean_token_accuracy": 0.6721787452697754, + "num_tokens": 11819394.0, + "step": 463 + }, + { + "epoch": 0.050955414012738856, + "grad_norm": 3.1093804836273193, + "learning_rate": 2.5411635565312843e-07, + "loss": 1.1527, + "mean_token_accuracy": 0.6667920351028442, + "num_tokens": 11843970.0, + "step": 464 + }, + { + "epoch": 0.051065231715352516, + "grad_norm": 3.0568134784698486, + "learning_rate": 2.5466520307354555e-07, + "loss": 1.1185, + "mean_token_accuracy": 0.6807675957679749, + "num_tokens": 11874828.0, + "step": 465 + }, + { + "epoch": 0.051175049417966176, + "grad_norm": 3.308863639831543, + "learning_rate": 2.5521405049396266e-07, + "loss": 1.104, + "mean_token_accuracy": 0.6772963404655457, + "num_tokens": 11895980.0, + "step": 466 + }, + { + "epoch": 0.051284867120579836, + "grad_norm": 3.677149534225464, + "learning_rate": 2.557628979143798e-07, + "loss": 1.1245, + "mean_token_accuracy": 0.6739895939826965, + "num_tokens": 11918735.0, + "step": 467 + }, + { + "epoch": 0.051394684823193496, + "grad_norm": 3.1102800369262695, + "learning_rate": 2.5631174533479695e-07, + "loss": 1.026, + "mean_token_accuracy": 0.6972888112068176, + "num_tokens": 11942401.0, + "step": 468 + }, + { + "epoch": 0.05150450252580716, + "grad_norm": 3.0361979007720947, + "learning_rate": 2.5686059275521406e-07, + "loss": 1.1668, + "mean_token_accuracy": 0.6661399602890015, + "num_tokens": 
11967051.0, + "step": 469 + }, + { + "epoch": 0.05161432022842082, + "grad_norm": 2.8133976459503174, + "learning_rate": 2.574094401756312e-07, + "loss": 1.1695, + "mean_token_accuracy": 0.6673774123191833, + "num_tokens": 11992977.0, + "step": 470 + }, + { + "epoch": 0.05172413793103448, + "grad_norm": 2.8034746646881104, + "learning_rate": 2.5795828759604824e-07, + "loss": 1.3122, + "mean_token_accuracy": 0.6298360228538513, + "num_tokens": 12023459.0, + "step": 471 + }, + { + "epoch": 0.05183395563364814, + "grad_norm": 3.2314271926879883, + "learning_rate": 2.585071350164654e-07, + "loss": 1.1429, + "mean_token_accuracy": 0.6640405654907227, + "num_tokens": 12045185.0, + "step": 472 + }, + { + "epoch": 0.05194377333626181, + "grad_norm": 2.9093284606933594, + "learning_rate": 2.590559824368825e-07, + "loss": 1.0853, + "mean_token_accuracy": 0.6841162443161011, + "num_tokens": 12070392.0, + "step": 473 + }, + { + "epoch": 0.05205359103887547, + "grad_norm": 3.0238430500030518, + "learning_rate": 2.5960482985729964e-07, + "loss": 1.1178, + "mean_token_accuracy": 0.6764590740203857, + "num_tokens": 12092387.0, + "step": 474 + }, + { + "epoch": 0.05216340874148913, + "grad_norm": 2.7613296508789062, + "learning_rate": 2.601536772777168e-07, + "loss": 1.1516, + "mean_token_accuracy": 0.6748732328414917, + "num_tokens": 12121010.0, + "step": 475 + }, + { + "epoch": 0.05227322644410279, + "grad_norm": 2.8923678398132324, + "learning_rate": 2.607025246981339e-07, + "loss": 1.1573, + "mean_token_accuracy": 0.6678214073181152, + "num_tokens": 12148143.0, + "step": 476 + }, + { + "epoch": 0.05238304414671645, + "grad_norm": 3.095031261444092, + "learning_rate": 2.6125137211855104e-07, + "loss": 1.212, + "mean_token_accuracy": 0.6627805233001709, + "num_tokens": 12174030.0, + "step": 477 + }, + { + "epoch": 0.052492861849330115, + "grad_norm": 3.1414568424224854, + "learning_rate": 2.6180021953896815e-07, + "loss": 1.1003, + "mean_token_accuracy": 0.6828087568283081, + 
"num_tokens": 12195607.0, + "step": 478 + }, + { + "epoch": 0.052602679551943775, + "grad_norm": 3.085801124572754, + "learning_rate": 2.623490669593853e-07, + "loss": 1.2352, + "mean_token_accuracy": 0.6476937532424927, + "num_tokens": 12220170.0, + "step": 479 + }, + { + "epoch": 0.052712497254557435, + "grad_norm": 2.8315844535827637, + "learning_rate": 2.6289791437980244e-07, + "loss": 1.0659, + "mean_token_accuracy": 0.6877686381340027, + "num_tokens": 12244645.0, + "step": 480 + }, + { + "epoch": 0.052822314957171095, + "grad_norm": 2.9887163639068604, + "learning_rate": 2.634467618002195e-07, + "loss": 1.0789, + "mean_token_accuracy": 0.6914464235305786, + "num_tokens": 12269219.0, + "step": 481 + }, + { + "epoch": 0.052932132659784754, + "grad_norm": 2.827867031097412, + "learning_rate": 2.639956092206366e-07, + "loss": 1.0337, + "mean_token_accuracy": 0.695317268371582, + "num_tokens": 12295415.0, + "step": 482 + }, + { + "epoch": 0.05304195036239842, + "grad_norm": 2.4254908561706543, + "learning_rate": 2.645444566410538e-07, + "loss": 1.0655, + "mean_token_accuracy": 0.6835272312164307, + "num_tokens": 12325604.0, + "step": 483 + }, + { + "epoch": 0.05315176806501208, + "grad_norm": 2.6219310760498047, + "learning_rate": 2.650933040614709e-07, + "loss": 1.276, + "mean_token_accuracy": 0.6420968770980835, + "num_tokens": 12355338.0, + "step": 484 + }, + { + "epoch": 0.05326158576762574, + "grad_norm": 3.5068368911743164, + "learning_rate": 2.65642151481888e-07, + "loss": 1.1668, + "mean_token_accuracy": 0.663312554359436, + "num_tokens": 12375729.0, + "step": 485 + }, + { + "epoch": 0.0533714034702394, + "grad_norm": 3.0866293907165527, + "learning_rate": 2.6619099890230513e-07, + "loss": 1.2364, + "mean_token_accuracy": 0.6499518156051636, + "num_tokens": 12400570.0, + "step": 486 + }, + { + "epoch": 0.05348122117285306, + "grad_norm": 3.0521810054779053, + "learning_rate": 2.667398463227223e-07, + "loss": 1.0715, + "mean_token_accuracy": 
0.6819441318511963, + "num_tokens": 12423152.0, + "step": 487 + }, + { + "epoch": 0.05359103887546673, + "grad_norm": 2.6561851501464844, + "learning_rate": 2.672886937431394e-07, + "loss": 1.1211, + "mean_token_accuracy": 0.6778345704078674, + "num_tokens": 12452251.0, + "step": 488 + }, + { + "epoch": 0.05370085657808039, + "grad_norm": 3.052508592605591, + "learning_rate": 2.678375411635565e-07, + "loss": 1.091, + "mean_token_accuracy": 0.677112877368927, + "num_tokens": 12476209.0, + "step": 489 + }, + { + "epoch": 0.05381067428069405, + "grad_norm": 2.938105821609497, + "learning_rate": 2.6838638858397364e-07, + "loss": 1.1012, + "mean_token_accuracy": 0.6758878231048584, + "num_tokens": 12498645.0, + "step": 490 + }, + { + "epoch": 0.05392049198330771, + "grad_norm": 2.979836940765381, + "learning_rate": 2.6893523600439076e-07, + "loss": 1.1452, + "mean_token_accuracy": 0.6702375411987305, + "num_tokens": 12523270.0, + "step": 491 + }, + { + "epoch": 0.054030309685921374, + "grad_norm": 3.3953850269317627, + "learning_rate": 2.6948408342480787e-07, + "loss": 1.1158, + "mean_token_accuracy": 0.6718460321426392, + "num_tokens": 12546459.0, + "step": 492 + }, + { + "epoch": 0.054140127388535034, + "grad_norm": 2.9087178707122803, + "learning_rate": 2.70032930845225e-07, + "loss": 1.084, + "mean_token_accuracy": 0.680304765701294, + "num_tokens": 12573566.0, + "step": 493 + }, + { + "epoch": 0.054249945091148694, + "grad_norm": 2.9465956687927246, + "learning_rate": 2.7058177826564215e-07, + "loss": 1.1863, + "mean_token_accuracy": 0.6642053127288818, + "num_tokens": 12600534.0, + "step": 494 + }, + { + "epoch": 0.05435976279376235, + "grad_norm": 3.180112838745117, + "learning_rate": 2.7113062568605927e-07, + "loss": 1.1156, + "mean_token_accuracy": 0.6750054359436035, + "num_tokens": 12623381.0, + "step": 495 + }, + { + "epoch": 0.05446958049637601, + "grad_norm": 2.808340311050415, + "learning_rate": 2.716794731064764e-07, + "loss": 1.1583, + 
"mean_token_accuracy": 0.6674495935440063, + "num_tokens": 12650685.0, + "step": 496 + }, + { + "epoch": 0.05457939819898968, + "grad_norm": 3.2020187377929688, + "learning_rate": 2.722283205268935e-07, + "loss": 1.0725, + "mean_token_accuracy": 0.6871051788330078, + "num_tokens": 12671632.0, + "step": 497 + }, + { + "epoch": 0.05468921590160334, + "grad_norm": 2.6555123329162598, + "learning_rate": 2.7277716794731067e-07, + "loss": 1.2229, + "mean_token_accuracy": 0.6534762382507324, + "num_tokens": 12700060.0, + "step": 498 + }, + { + "epoch": 0.054799033604217, + "grad_norm": 2.764248847961426, + "learning_rate": 2.733260153677278e-07, + "loss": 1.1826, + "mean_token_accuracy": 0.658277690410614, + "num_tokens": 12729051.0, + "step": 499 + }, + { + "epoch": 0.05490885130683066, + "grad_norm": 3.131464958190918, + "learning_rate": 2.7387486278814485e-07, + "loss": 1.1079, + "mean_token_accuracy": 0.6732529997825623, + "num_tokens": 12750737.0, + "step": 500 + }, + { + "epoch": 0.05501866900944432, + "grad_norm": 2.6778931617736816, + "learning_rate": 2.74423710208562e-07, + "loss": 1.1164, + "mean_token_accuracy": 0.6747087240219116, + "num_tokens": 12775316.0, + "step": 501 + }, + { + "epoch": 0.055128486712057986, + "grad_norm": 2.442354917526245, + "learning_rate": 2.7497255762897913e-07, + "loss": 1.2191, + "mean_token_accuracy": 0.6500983238220215, + "num_tokens": 12805952.0, + "step": 502 + }, + { + "epoch": 0.055238304414671646, + "grad_norm": 3.113615036010742, + "learning_rate": 2.7552140504939624e-07, + "loss": 1.1657, + "mean_token_accuracy": 0.6551817655563354, + "num_tokens": 12826677.0, + "step": 503 + }, + { + "epoch": 0.055348122117285306, + "grad_norm": 2.5551652908325195, + "learning_rate": 2.7607025246981336e-07, + "loss": 1.1121, + "mean_token_accuracy": 0.6774107813835144, + "num_tokens": 12854738.0, + "step": 504 + }, + { + "epoch": 0.055457939819898966, + "grad_norm": 3.1680080890655518, + "learning_rate": 2.7661909989023053e-07, + "loss": 
1.0284, + "mean_token_accuracy": 0.6967213153839111, + "num_tokens": 12875948.0, + "step": 505 + }, + { + "epoch": 0.055567757522512626, + "grad_norm": 2.8096299171447754, + "learning_rate": 2.7716794731064764e-07, + "loss": 1.1787, + "mean_token_accuracy": 0.6655986309051514, + "num_tokens": 12903910.0, + "step": 506 + }, + { + "epoch": 0.05567757522512629, + "grad_norm": 2.932955026626587, + "learning_rate": 2.7771679473106476e-07, + "loss": 1.0884, + "mean_token_accuracy": 0.6831083297729492, + "num_tokens": 12927043.0, + "step": 507 + }, + { + "epoch": 0.05578739292773995, + "grad_norm": 3.123605728149414, + "learning_rate": 2.7826564215148187e-07, + "loss": 1.1531, + "mean_token_accuracy": 0.6676799058914185, + "num_tokens": 12948425.0, + "step": 508 + }, + { + "epoch": 0.05589721063035361, + "grad_norm": 2.8389883041381836, + "learning_rate": 2.78814489571899e-07, + "loss": 1.0207, + "mean_token_accuracy": 0.6915480494499207, + "num_tokens": 12969886.0, + "step": 509 + }, + { + "epoch": 0.05600702833296727, + "grad_norm": 3.0248024463653564, + "learning_rate": 2.793633369923161e-07, + "loss": 1.0332, + "mean_token_accuracy": 0.6993164420127869, + "num_tokens": 12989628.0, + "step": 510 + }, + { + "epoch": 0.05611684603558094, + "grad_norm": 2.7176456451416016, + "learning_rate": 2.799121844127332e-07, + "loss": 1.1719, + "mean_token_accuracy": 0.6630688309669495, + "num_tokens": 13015076.0, + "step": 511 + }, + { + "epoch": 0.0562266637381946, + "grad_norm": 3.0295066833496094, + "learning_rate": 2.804610318331504e-07, + "loss": 1.1259, + "mean_token_accuracy": 0.6758173704147339, + "num_tokens": 13037272.0, + "step": 512 + }, + { + "epoch": 0.05633648144080826, + "grad_norm": 3.0940208435058594, + "learning_rate": 2.810098792535675e-07, + "loss": 1.0498, + "mean_token_accuracy": 0.6918799877166748, + "num_tokens": 13058479.0, + "step": 513 + }, + { + "epoch": 0.05644629914342192, + "grad_norm": 3.5650055408477783, + "learning_rate": 2.815587266739846e-07, + 
"loss": 1.0979, + "mean_token_accuracy": 0.6782865524291992, + "num_tokens": 13076011.0, + "step": 514 + }, + { + "epoch": 0.05655611684603558, + "grad_norm": 3.1170315742492676, + "learning_rate": 2.8210757409440173e-07, + "loss": 1.0899, + "mean_token_accuracy": 0.6827423572540283, + "num_tokens": 13098518.0, + "step": 515 + }, + { + "epoch": 0.056665934548649245, + "grad_norm": 2.686107873916626, + "learning_rate": 2.826564215148189e-07, + "loss": 1.1915, + "mean_token_accuracy": 0.6528488993644714, + "num_tokens": 13125949.0, + "step": 516 + }, + { + "epoch": 0.056775752251262905, + "grad_norm": 2.9495038986206055, + "learning_rate": 2.83205268935236e-07, + "loss": 1.0723, + "mean_token_accuracy": 0.6948891282081604, + "num_tokens": 13148723.0, + "step": 517 + }, + { + "epoch": 0.056885569953876565, + "grad_norm": 2.532296657562256, + "learning_rate": 2.8375411635565313e-07, + "loss": 1.12, + "mean_token_accuracy": 0.6755173206329346, + "num_tokens": 13180119.0, + "step": 518 + }, + { + "epoch": 0.056995387656490225, + "grad_norm": 2.719142198562622, + "learning_rate": 2.843029637760702e-07, + "loss": 1.182, + "mean_token_accuracy": 0.6604490280151367, + "num_tokens": 13206324.0, + "step": 519 + }, + { + "epoch": 0.057105205359103885, + "grad_norm": 2.664639949798584, + "learning_rate": 2.8485181119648736e-07, + "loss": 1.0209, + "mean_token_accuracy": 0.6982862949371338, + "num_tokens": 13232713.0, + "step": 520 + }, + { + "epoch": 0.05721502306171755, + "grad_norm": 2.6891977787017822, + "learning_rate": 2.854006586169045e-07, + "loss": 1.1189, + "mean_token_accuracy": 0.6767873764038086, + "num_tokens": 13258620.0, + "step": 521 + }, + { + "epoch": 0.05732484076433121, + "grad_norm": 2.4648597240448, + "learning_rate": 2.859495060373216e-07, + "loss": 1.2087, + "mean_token_accuracy": 0.6588999032974243, + "num_tokens": 13288082.0, + "step": 522 + }, + { + "epoch": 0.05743465846694487, + "grad_norm": 2.683687686920166, + "learning_rate": 
2.8649835345773876e-07, + "loss": 1.2053, + "mean_token_accuracy": 0.6632301211357117, + "num_tokens": 13312561.0, + "step": 523 + }, + { + "epoch": 0.05754447616955853, + "grad_norm": 2.927563428878784, + "learning_rate": 2.870472008781559e-07, + "loss": 1.1561, + "mean_token_accuracy": 0.6610163450241089, + "num_tokens": 13334364.0, + "step": 524 + }, + { + "epoch": 0.05765429387217219, + "grad_norm": 2.7845377922058105, + "learning_rate": 2.87596048298573e-07, + "loss": 1.1811, + "mean_token_accuracy": 0.6539901494979858, + "num_tokens": 13359447.0, + "step": 525 + }, + { + "epoch": 0.05776411157478586, + "grad_norm": 3.198659658432007, + "learning_rate": 2.881448957189901e-07, + "loss": 1.1017, + "mean_token_accuracy": 0.6641305088996887, + "num_tokens": 13379219.0, + "step": 526 + }, + { + "epoch": 0.05787392927739952, + "grad_norm": 2.8018789291381836, + "learning_rate": 2.886937431394073e-07, + "loss": 1.1347, + "mean_token_accuracy": 0.6664095520973206, + "num_tokens": 13402415.0, + "step": 527 + }, + { + "epoch": 0.05798374698001318, + "grad_norm": 2.6917154788970947, + "learning_rate": 2.8924259055982434e-07, + "loss": 1.1477, + "mean_token_accuracy": 0.6792097091674805, + "num_tokens": 13427288.0, + "step": 528 + }, + { + "epoch": 0.05809356468262684, + "grad_norm": 3.2671566009521484, + "learning_rate": 2.8979143798024145e-07, + "loss": 1.1068, + "mean_token_accuracy": 0.6803802847862244, + "num_tokens": 13446124.0, + "step": 529 + }, + { + "epoch": 0.058203382385240504, + "grad_norm": 2.8098700046539307, + "learning_rate": 2.9034028540065857e-07, + "loss": 1.1143, + "mean_token_accuracy": 0.670087456703186, + "num_tokens": 13470655.0, + "step": 530 + }, + { + "epoch": 0.058313200087854164, + "grad_norm": 2.7658305168151855, + "learning_rate": 2.9088913282107574e-07, + "loss": 1.0779, + "mean_token_accuracy": 0.6859239339828491, + "num_tokens": 13495727.0, + "step": 531 + }, + { + "epoch": 0.058423017790467824, + "grad_norm": 2.7630887031555176, + 
"learning_rate": 2.9143798024149285e-07, + "loss": 1.1464, + "mean_token_accuracy": 0.6739048957824707, + "num_tokens": 13521815.0, + "step": 532 + }, + { + "epoch": 0.058532835493081484, + "grad_norm": 2.729121208190918, + "learning_rate": 2.9198682766190997e-07, + "loss": 1.0549, + "mean_token_accuracy": 0.6850112080574036, + "num_tokens": 13547967.0, + "step": 533 + }, + { + "epoch": 0.058642653195695144, + "grad_norm": 3.0382723808288574, + "learning_rate": 2.9253567508232713e-07, + "loss": 1.004, + "mean_token_accuracy": 0.7035975456237793, + "num_tokens": 13567594.0, + "step": 534 + }, + { + "epoch": 0.05875247089830881, + "grad_norm": 3.29567289352417, + "learning_rate": 2.9308452250274425e-07, + "loss": 1.1167, + "mean_token_accuracy": 0.676245927810669, + "num_tokens": 13585477.0, + "step": 535 + }, + { + "epoch": 0.05886228860092247, + "grad_norm": 2.891552686691284, + "learning_rate": 2.9363336992316136e-07, + "loss": 1.109, + "mean_token_accuracy": 0.6777382493019104, + "num_tokens": 13608609.0, + "step": 536 + }, + { + "epoch": 0.05897210630353613, + "grad_norm": 2.973256826400757, + "learning_rate": 2.941822173435785e-07, + "loss": 1.0498, + "mean_token_accuracy": 0.6890304684638977, + "num_tokens": 13629465.0, + "step": 537 + }, + { + "epoch": 0.05908192400614979, + "grad_norm": 2.3668549060821533, + "learning_rate": 2.947310647639956e-07, + "loss": 1.1112, + "mean_token_accuracy": 0.6803209781646729, + "num_tokens": 13661471.0, + "step": 538 + }, + { + "epoch": 0.05919174170876345, + "grad_norm": 2.305104970932007, + "learning_rate": 2.952799121844127e-07, + "loss": 1.1775, + "mean_token_accuracy": 0.6555168032646179, + "num_tokens": 13692471.0, + "step": 539 + }, + { + "epoch": 0.05930155941137712, + "grad_norm": 2.5830087661743164, + "learning_rate": 2.958287596048298e-07, + "loss": 1.206, + "mean_token_accuracy": 0.6526113748550415, + "num_tokens": 13721511.0, + "step": 540 + }, + { + "epoch": 0.059411377113990776, + "grad_norm": 
2.8340628147125244, + "learning_rate": 2.9637760702524694e-07, + "loss": 1.1842, + "mean_token_accuracy": 0.6607651114463806, + "num_tokens": 13745211.0, + "step": 541 + }, + { + "epoch": 0.059521194816604436, + "grad_norm": 2.5016112327575684, + "learning_rate": 2.969264544456641e-07, + "loss": 1.0964, + "mean_token_accuracy": 0.6832762956619263, + "num_tokens": 13772333.0, + "step": 542 + }, + { + "epoch": 0.059631012519218096, + "grad_norm": 2.67874813079834, + "learning_rate": 2.974753018660812e-07, + "loss": 1.1196, + "mean_token_accuracy": 0.6834762096405029, + "num_tokens": 13798208.0, + "step": 543 + }, + { + "epoch": 0.059740830221831756, + "grad_norm": 2.4577724933624268, + "learning_rate": 2.9802414928649834e-07, + "loss": 1.1788, + "mean_token_accuracy": 0.660507082939148, + "num_tokens": 13827937.0, + "step": 544 + }, + { + "epoch": 0.05985064792444542, + "grad_norm": 2.672624349594116, + "learning_rate": 2.9857299670691546e-07, + "loss": 1.1881, + "mean_token_accuracy": 0.6572923064231873, + "num_tokens": 13853408.0, + "step": 545 + }, + { + "epoch": 0.05996046562705908, + "grad_norm": 2.5474061965942383, + "learning_rate": 2.991218441273326e-07, + "loss": 1.1014, + "mean_token_accuracy": 0.6762779951095581, + "num_tokens": 13882677.0, + "step": 546 + }, + { + "epoch": 0.06007028332967274, + "grad_norm": 2.463909864425659, + "learning_rate": 2.996706915477497e-07, + "loss": 1.1176, + "mean_token_accuracy": 0.6747644543647766, + "num_tokens": 13911710.0, + "step": 547 + }, + { + "epoch": 0.0601801010322864, + "grad_norm": 2.262143611907959, + "learning_rate": 3.002195389681668e-07, + "loss": 1.2455, + "mean_token_accuracy": 0.6390523910522461, + "num_tokens": 13948777.0, + "step": 548 + }, + { + "epoch": 0.06028991873490007, + "grad_norm": 2.4843969345092773, + "learning_rate": 3.0076838638858397e-07, + "loss": 1.1064, + "mean_token_accuracy": 0.6752959489822388, + "num_tokens": 13977129.0, + "step": 549 + }, + { + "epoch": 0.06039973643751373, + 
"grad_norm": 2.6690175533294678, + "learning_rate": 3.013172338090011e-07, + "loss": 1.1451, + "mean_token_accuracy": 0.6668739318847656, + "num_tokens": 14005429.0, + "step": 550 + }, + { + "epoch": 0.06050955414012739, + "grad_norm": 2.5957396030426025, + "learning_rate": 3.018660812294182e-07, + "loss": 1.0822, + "mean_token_accuracy": 0.6810079216957092, + "num_tokens": 14033772.0, + "step": 551 + }, + { + "epoch": 0.06061937184274105, + "grad_norm": 2.6999850273132324, + "learning_rate": 3.024149286498353e-07, + "loss": 1.1818, + "mean_token_accuracy": 0.6581510901451111, + "num_tokens": 14059174.0, + "step": 552 + }, + { + "epoch": 0.06072918954535471, + "grad_norm": 2.541529417037964, + "learning_rate": 3.029637760702525e-07, + "loss": 1.1036, + "mean_token_accuracy": 0.6821290254592896, + "num_tokens": 14090278.0, + "step": 553 + }, + { + "epoch": 0.060839007247968376, + "grad_norm": 2.4281489849090576, + "learning_rate": 3.035126234906696e-07, + "loss": 1.2258, + "mean_token_accuracy": 0.6517138481140137, + "num_tokens": 14123305.0, + "step": 554 + }, + { + "epoch": 0.060948824950582035, + "grad_norm": 2.676240921020508, + "learning_rate": 3.040614709110867e-07, + "loss": 1.1612, + "mean_token_accuracy": 0.6636530160903931, + "num_tokens": 14148921.0, + "step": 555 + }, + { + "epoch": 0.061058642653195695, + "grad_norm": 3.0314180850982666, + "learning_rate": 3.046103183315038e-07, + "loss": 1.1553, + "mean_token_accuracy": 0.6710172891616821, + "num_tokens": 14168906.0, + "step": 556 + }, + { + "epoch": 0.061168460355809355, + "grad_norm": 2.6786983013153076, + "learning_rate": 3.0515916575192094e-07, + "loss": 1.1206, + "mean_token_accuracy": 0.6777726411819458, + "num_tokens": 14191863.0, + "step": 557 + }, + { + "epoch": 0.061278278058423015, + "grad_norm": 2.5220839977264404, + "learning_rate": 3.0570801317233806e-07, + "loss": 1.1412, + "mean_token_accuracy": 0.6709957718849182, + "num_tokens": 14220102.0, + "step": 558 + }, + { + "epoch": 
0.06138809576103668, + "grad_norm": 2.5822012424468994, + "learning_rate": 3.062568605927552e-07, + "loss": 1.0452, + "mean_token_accuracy": 0.6914668083190918, + "num_tokens": 14247116.0, + "step": 559 + }, + { + "epoch": 0.06149791346365034, + "grad_norm": 2.9499566555023193, + "learning_rate": 3.0680570801317234e-07, + "loss": 1.1254, + "mean_token_accuracy": 0.6720037460327148, + "num_tokens": 14271541.0, + "step": 560 + }, + { + "epoch": 0.061607731166264, + "grad_norm": 2.586855173110962, + "learning_rate": 3.0735455543358946e-07, + "loss": 1.2017, + "mean_token_accuracy": 0.6657958030700684, + "num_tokens": 14296436.0, + "step": 561 + }, + { + "epoch": 0.06171754886887766, + "grad_norm": 2.1363308429718018, + "learning_rate": 3.0790340285400657e-07, + "loss": 1.0964, + "mean_token_accuracy": 0.6822270154953003, + "num_tokens": 14333264.0, + "step": 562 + }, + { + "epoch": 0.06182736657149132, + "grad_norm": 2.760403633117676, + "learning_rate": 3.084522502744237e-07, + "loss": 1.106, + "mean_token_accuracy": 0.6756364703178406, + "num_tokens": 14355894.0, + "step": 563 + }, + { + "epoch": 0.06193718427410499, + "grad_norm": 2.311912775039673, + "learning_rate": 3.0900109769484086e-07, + "loss": 1.2435, + "mean_token_accuracy": 0.6425210237503052, + "num_tokens": 14391095.0, + "step": 564 + }, + { + "epoch": 0.06204700197671865, + "grad_norm": 2.760313034057617, + "learning_rate": 3.0954994511525797e-07, + "loss": 1.0791, + "mean_token_accuracy": 0.6763796806335449, + "num_tokens": 14416548.0, + "step": 565 + }, + { + "epoch": 0.06215681967933231, + "grad_norm": 3.200000762939453, + "learning_rate": 3.1009879253567503e-07, + "loss": 1.1368, + "mean_token_accuracy": 0.6725000143051147, + "num_tokens": 14437879.0, + "step": 566 + }, + { + "epoch": 0.06226663738194597, + "grad_norm": 2.1694140434265137, + "learning_rate": 3.1064763995609215e-07, + "loss": 1.1635, + "mean_token_accuracy": 0.6688697934150696, + "num_tokens": 14473407.0, + "step": 567 + }, + { + 
"epoch": 0.062376455084559634, + "grad_norm": 2.330894708633423, + "learning_rate": 3.111964873765093e-07, + "loss": 1.1548, + "mean_token_accuracy": 0.6648834347724915, + "num_tokens": 14503738.0, + "step": 568 + }, + { + "epoch": 0.062486272787173294, + "grad_norm": 2.7172670364379883, + "learning_rate": 3.1174533479692643e-07, + "loss": 1.1007, + "mean_token_accuracy": 0.6811221241950989, + "num_tokens": 14526573.0, + "step": 569 + }, + { + "epoch": 0.06259609048978695, + "grad_norm": 2.214191436767578, + "learning_rate": 3.1229418221734355e-07, + "loss": 1.2196, + "mean_token_accuracy": 0.6514593362808228, + "num_tokens": 14560975.0, + "step": 570 + }, + { + "epoch": 0.06270590819240061, + "grad_norm": 3.3355789184570312, + "learning_rate": 3.128430296377607e-07, + "loss": 0.9586, + "mean_token_accuracy": 0.7100955247879028, + "num_tokens": 14579176.0, + "step": 571 + }, + { + "epoch": 0.06281572589501427, + "grad_norm": 2.736811876296997, + "learning_rate": 3.1339187705817783e-07, + "loss": 1.1553, + "mean_token_accuracy": 0.6536723971366882, + "num_tokens": 14604893.0, + "step": 572 + }, + { + "epoch": 0.06292554359762793, + "grad_norm": 2.9278781414031982, + "learning_rate": 3.1394072447859495e-07, + "loss": 1.0729, + "mean_token_accuracy": 0.6832210421562195, + "num_tokens": 14627076.0, + "step": 573 + }, + { + "epoch": 0.0630353613002416, + "grad_norm": 2.7005088329315186, + "learning_rate": 3.1448957189901206e-07, + "loss": 1.122, + "mean_token_accuracy": 0.676619291305542, + "num_tokens": 14655064.0, + "step": 574 + }, + { + "epoch": 0.06314517900285525, + "grad_norm": 2.529568910598755, + "learning_rate": 3.1503841931942923e-07, + "loss": 1.1344, + "mean_token_accuracy": 0.6697264313697815, + "num_tokens": 14684034.0, + "step": 575 + }, + { + "epoch": 0.06325499670546893, + "grad_norm": 2.935487985610962, + "learning_rate": 3.155872667398463e-07, + "loss": 1.0874, + "mean_token_accuracy": 0.6792458295822144, + "num_tokens": 14705563.0, + "step": 576 + 
}, + { + "epoch": 0.06336481440808259, + "grad_norm": 2.1406054496765137, + "learning_rate": 3.161361141602634e-07, + "loss": 1.3002, + "mean_token_accuracy": 0.627912700176239, + "num_tokens": 14742080.0, + "step": 577 + }, + { + "epoch": 0.06347463211069625, + "grad_norm": 2.5905377864837646, + "learning_rate": 3.166849615806805e-07, + "loss": 1.0694, + "mean_token_accuracy": 0.6816224455833435, + "num_tokens": 14767676.0, + "step": 578 + }, + { + "epoch": 0.0635844498133099, + "grad_norm": 2.613710403442383, + "learning_rate": 3.172338090010977e-07, + "loss": 1.1835, + "mean_token_accuracy": 0.6615107655525208, + "num_tokens": 14795249.0, + "step": 579 + }, + { + "epoch": 0.06369426751592357, + "grad_norm": 2.7452645301818848, + "learning_rate": 3.177826564215148e-07, + "loss": 1.0843, + "mean_token_accuracy": 0.6845394968986511, + "num_tokens": 14820260.0, + "step": 580 + }, + { + "epoch": 0.06380408521853723, + "grad_norm": 2.9504921436309814, + "learning_rate": 3.183315038419319e-07, + "loss": 1.0957, + "mean_token_accuracy": 0.678625762462616, + "num_tokens": 14842140.0, + "step": 581 + }, + { + "epoch": 0.06391390292115089, + "grad_norm": 2.559813976287842, + "learning_rate": 3.188803512623491e-07, + "loss": 1.2277, + "mean_token_accuracy": 0.642980694770813, + "num_tokens": 14868787.0, + "step": 582 + }, + { + "epoch": 0.06402372062376455, + "grad_norm": 2.761876106262207, + "learning_rate": 3.194291986827662e-07, + "loss": 1.1263, + "mean_token_accuracy": 0.6721449494361877, + "num_tokens": 14891338.0, + "step": 583 + }, + { + "epoch": 0.0641335383263782, + "grad_norm": 2.925356149673462, + "learning_rate": 3.199780461031833e-07, + "loss": 1.1482, + "mean_token_accuracy": 0.6850920915603638, + "num_tokens": 14912968.0, + "step": 584 + }, + { + "epoch": 0.06424335602899188, + "grad_norm": 2.5387139320373535, + "learning_rate": 3.205268935236004e-07, + "loss": 1.2144, + "mean_token_accuracy": 0.6441872715950012, + "num_tokens": 14939875.0, + "step": 585 + 
}, + { + "epoch": 0.06435317373160554, + "grad_norm": 3.001948833465576, + "learning_rate": 3.2107574094401755e-07, + "loss": 1.1671, + "mean_token_accuracy": 0.6665218472480774, + "num_tokens": 14959796.0, + "step": 586 + }, + { + "epoch": 0.0644629914342192, + "grad_norm": 2.48949933052063, + "learning_rate": 3.2162458836443467e-07, + "loss": 1.0303, + "mean_token_accuracy": 0.7023017406463623, + "num_tokens": 14986652.0, + "step": 587 + }, + { + "epoch": 0.06457280913683286, + "grad_norm": 2.5178232192993164, + "learning_rate": 3.221734357848518e-07, + "loss": 1.1916, + "mean_token_accuracy": 0.6636935472488403, + "num_tokens": 15014598.0, + "step": 588 + }, + { + "epoch": 0.06468262683944652, + "grad_norm": 2.6734466552734375, + "learning_rate": 3.227222832052689e-07, + "loss": 1.1273, + "mean_token_accuracy": 0.6755118370056152, + "num_tokens": 15040795.0, + "step": 589 + }, + { + "epoch": 0.06479244454206018, + "grad_norm": 2.2240850925445557, + "learning_rate": 3.2327113062568606e-07, + "loss": 1.0873, + "mean_token_accuracy": 0.6912739276885986, + "num_tokens": 15071406.0, + "step": 590 + }, + { + "epoch": 0.06490226224467384, + "grad_norm": 2.8684821128845215, + "learning_rate": 3.238199780461032e-07, + "loss": 1.0903, + "mean_token_accuracy": 0.6857134103775024, + "num_tokens": 15092236.0, + "step": 591 + }, + { + "epoch": 0.0650120799472875, + "grad_norm": 2.567096471786499, + "learning_rate": 3.243688254665203e-07, + "loss": 1.1064, + "mean_token_accuracy": 0.6772062182426453, + "num_tokens": 15119634.0, + "step": 592 + }, + { + "epoch": 0.06512189764990116, + "grad_norm": 3.0633580684661865, + "learning_rate": 3.2491767288693746e-07, + "loss": 0.9197, + "mean_token_accuracy": 0.7217950224876404, + "num_tokens": 15137252.0, + "step": 593 + }, + { + "epoch": 0.06523171535251482, + "grad_norm": 2.629080295562744, + "learning_rate": 3.254665203073546e-07, + "loss": 0.998, + "mean_token_accuracy": 0.7046842575073242, + "num_tokens": 15162635.0, + "step": 
594 + }, + { + "epoch": 0.06534153305512849, + "grad_norm": 2.7623157501220703, + "learning_rate": 3.2601536772777164e-07, + "loss": 1.1065, + "mean_token_accuracy": 0.6804542541503906, + "num_tokens": 15186154.0, + "step": 595 + }, + { + "epoch": 0.06545135075774215, + "grad_norm": 2.3754212856292725, + "learning_rate": 3.2656421514818876e-07, + "loss": 1.0577, + "mean_token_accuracy": 0.6929589509963989, + "num_tokens": 15218393.0, + "step": 596 + }, + { + "epoch": 0.06556116846035581, + "grad_norm": 2.3875789642333984, + "learning_rate": 3.271130625686059e-07, + "loss": 1.1252, + "mean_token_accuracy": 0.675147294998169, + "num_tokens": 15246333.0, + "step": 597 + }, + { + "epoch": 0.06567098616296947, + "grad_norm": 2.343518018722534, + "learning_rate": 3.2766190998902304e-07, + "loss": 1.1118, + "mean_token_accuracy": 0.6771448850631714, + "num_tokens": 15277096.0, + "step": 598 + }, + { + "epoch": 0.06578080386558313, + "grad_norm": 2.7442100048065186, + "learning_rate": 3.2821075740944015e-07, + "loss": 1.1387, + "mean_token_accuracy": 0.6755744218826294, + "num_tokens": 15299240.0, + "step": 599 + }, + { + "epoch": 0.06589062156819679, + "grad_norm": 2.7567920684814453, + "learning_rate": 3.2875960482985727e-07, + "loss": 1.0963, + "mean_token_accuracy": 0.6801014542579651, + "num_tokens": 15321196.0, + "step": 600 + }, + { + "epoch": 0.06600043927081045, + "grad_norm": 2.496320962905884, + "learning_rate": 3.2930845225027444e-07, + "loss": 1.2089, + "mean_token_accuracy": 0.6570820808410645, + "num_tokens": 15349870.0, + "step": 601 + }, + { + "epoch": 0.06611025697342411, + "grad_norm": 2.5440642833709717, + "learning_rate": 3.2985729967069155e-07, + "loss": 1.0364, + "mean_token_accuracy": 0.6978580355644226, + "num_tokens": 15374781.0, + "step": 602 + }, + { + "epoch": 0.06622007467603777, + "grad_norm": 2.591243267059326, + "learning_rate": 3.3040614709110867e-07, + "loss": 1.1201, + "mean_token_accuracy": 0.6765735149383545, + "num_tokens": 
15399064.0, + "step": 603 + }, + { + "epoch": 0.06632989237865144, + "grad_norm": 2.1405045986175537, + "learning_rate": 3.3095499451152573e-07, + "loss": 1.1167, + "mean_token_accuracy": 0.6746946573257446, + "num_tokens": 15431924.0, + "step": 604 + }, + { + "epoch": 0.0664397100812651, + "grad_norm": 2.8469724655151367, + "learning_rate": 3.315038419319429e-07, + "loss": 1.0771, + "mean_token_accuracy": 0.6785841584205627, + "num_tokens": 15451022.0, + "step": 605 + }, + { + "epoch": 0.06654952778387876, + "grad_norm": 2.5096375942230225, + "learning_rate": 3.3205268935236e-07, + "loss": 1.0489, + "mean_token_accuracy": 0.6980577707290649, + "num_tokens": 15475091.0, + "step": 606 + }, + { + "epoch": 0.06665934548649242, + "grad_norm": 2.2841386795043945, + "learning_rate": 3.3260153677277713e-07, + "loss": 1.1722, + "mean_token_accuracy": 0.6546143293380737, + "num_tokens": 15505208.0, + "step": 607 + }, + { + "epoch": 0.06676916318910608, + "grad_norm": 2.7342302799224854, + "learning_rate": 3.331503841931943e-07, + "loss": 1.1722, + "mean_token_accuracy": 0.662571907043457, + "num_tokens": 15527816.0, + "step": 608 + }, + { + "epoch": 0.06687898089171974, + "grad_norm": 2.5532171726226807, + "learning_rate": 3.336992316136114e-07, + "loss": 1.0548, + "mean_token_accuracy": 0.6864217519760132, + "num_tokens": 15552142.0, + "step": 609 + }, + { + "epoch": 0.0669887985943334, + "grad_norm": 2.252918004989624, + "learning_rate": 3.3424807903402853e-07, + "loss": 1.1345, + "mean_token_accuracy": 0.664836049079895, + "num_tokens": 15583340.0, + "step": 610 + }, + { + "epoch": 0.06709861629694706, + "grad_norm": 2.513214111328125, + "learning_rate": 3.3479692645444564e-07, + "loss": 1.0621, + "mean_token_accuracy": 0.6940460801124573, + "num_tokens": 15609329.0, + "step": 611 + }, + { + "epoch": 0.06720843399956072, + "grad_norm": 2.653505325317383, + "learning_rate": 3.353457738748628e-07, + "loss": 1.0478, + "mean_token_accuracy": 0.6910502910614014, + 
"num_tokens": 15632131.0, + "step": 612 + }, + { + "epoch": 0.06731825170217438, + "grad_norm": 2.5701401233673096, + "learning_rate": 3.3589462129527993e-07, + "loss": 1.1011, + "mean_token_accuracy": 0.6745952367782593, + "num_tokens": 15657145.0, + "step": 613 + }, + { + "epoch": 0.06742806940478806, + "grad_norm": 2.27404522895813, + "learning_rate": 3.36443468715697e-07, + "loss": 1.086, + "mean_token_accuracy": 0.6923003196716309, + "num_tokens": 15687384.0, + "step": 614 + }, + { + "epoch": 0.06753788710740172, + "grad_norm": 2.920088291168213, + "learning_rate": 3.369923161361141e-07, + "loss": 1.1263, + "mean_token_accuracy": 0.6818059086799622, + "num_tokens": 15707189.0, + "step": 615 + }, + { + "epoch": 0.06764770481001538, + "grad_norm": 2.548923969268799, + "learning_rate": 3.3754116355653127e-07, + "loss": 1.0619, + "mean_token_accuracy": 0.693574070930481, + "num_tokens": 15731385.0, + "step": 616 + }, + { + "epoch": 0.06775752251262904, + "grad_norm": 2.8301897048950195, + "learning_rate": 3.380900109769484e-07, + "loss": 1.1502, + "mean_token_accuracy": 0.6654117107391357, + "num_tokens": 15752699.0, + "step": 617 + }, + { + "epoch": 0.0678673402152427, + "grad_norm": 2.586669921875, + "learning_rate": 3.386388583973655e-07, + "loss": 1.0893, + "mean_token_accuracy": 0.6741517782211304, + "num_tokens": 15777412.0, + "step": 618 + }, + { + "epoch": 0.06797715791785636, + "grad_norm": 2.23418927192688, + "learning_rate": 3.3918770581778267e-07, + "loss": 1.1229, + "mean_token_accuracy": 0.6660781502723694, + "num_tokens": 15810036.0, + "step": 619 + }, + { + "epoch": 0.06808697562047002, + "grad_norm": 2.6059556007385254, + "learning_rate": 3.397365532381998e-07, + "loss": 1.0957, + "mean_token_accuracy": 0.6792992949485779, + "num_tokens": 15836803.0, + "step": 620 + }, + { + "epoch": 0.06819679332308368, + "grad_norm": 2.427591323852539, + "learning_rate": 3.402854006586169e-07, + "loss": 1.1799, + "mean_token_accuracy": 0.6517577767372131, + 
"num_tokens": 15865398.0, + "step": 621 + }, + { + "epoch": 0.06830661102569734, + "grad_norm": 2.647636890411377, + "learning_rate": 3.40834248079034e-07, + "loss": 1.151, + "mean_token_accuracy": 0.6646771430969238, + "num_tokens": 15889499.0, + "step": 622 + }, + { + "epoch": 0.06841642872831101, + "grad_norm": 2.7054340839385986, + "learning_rate": 3.4138309549945113e-07, + "loss": 1.0788, + "mean_token_accuracy": 0.6827706694602966, + "num_tokens": 15912410.0, + "step": 623 + }, + { + "epoch": 0.06852624643092467, + "grad_norm": 2.858544111251831, + "learning_rate": 3.4193194291986825e-07, + "loss": 1.0988, + "mean_token_accuracy": 0.6892728209495544, + "num_tokens": 15935146.0, + "step": 624 + }, + { + "epoch": 0.06863606413353833, + "grad_norm": 2.695779323577881, + "learning_rate": 3.4248079034028536e-07, + "loss": 1.0261, + "mean_token_accuracy": 0.6899174451828003, + "num_tokens": 15957158.0, + "step": 625 + }, + { + "epoch": 0.06874588183615199, + "grad_norm": 2.5337090492248535, + "learning_rate": 3.430296377607025e-07, + "loss": 1.1079, + "mean_token_accuracy": 0.6794678568840027, + "num_tokens": 15982045.0, + "step": 626 + }, + { + "epoch": 0.06885569953876565, + "grad_norm": 2.5277414321899414, + "learning_rate": 3.4357848518111965e-07, + "loss": 1.0664, + "mean_token_accuracy": 0.6879560351371765, + "num_tokens": 16006860.0, + "step": 627 + }, + { + "epoch": 0.06896551724137931, + "grad_norm": 2.8189308643341064, + "learning_rate": 3.4412733260153676e-07, + "loss": 1.1148, + "mean_token_accuracy": 0.6801941990852356, + "num_tokens": 16027816.0, + "step": 628 + }, + { + "epoch": 0.06907533494399297, + "grad_norm": 2.181428909301758, + "learning_rate": 3.446761800219539e-07, + "loss": 1.1035, + "mean_token_accuracy": 0.6896543502807617, + "num_tokens": 16058556.0, + "step": 629 + }, + { + "epoch": 0.06918515264660663, + "grad_norm": 2.951331615447998, + "learning_rate": 3.4522502744237104e-07, + "loss": 1.0531, + "mean_token_accuracy": 
0.6959325075149536, + "num_tokens": 16078933.0, + "step": 630 + }, + { + "epoch": 0.06929497034922029, + "grad_norm": 2.1025326251983643, + "learning_rate": 3.4577387486278816e-07, + "loss": 1.1183, + "mean_token_accuracy": 0.6750094890594482, + "num_tokens": 16113473.0, + "step": 631 + }, + { + "epoch": 0.06940478805183395, + "grad_norm": 2.3692197799682617, + "learning_rate": 3.463227222832053e-07, + "loss": 1.0975, + "mean_token_accuracy": 0.6764533519744873, + "num_tokens": 16141550.0, + "step": 632 + }, + { + "epoch": 0.06951460575444762, + "grad_norm": 2.4781243801116943, + "learning_rate": 3.4687156970362234e-07, + "loss": 1.0396, + "mean_token_accuracy": 0.6926468014717102, + "num_tokens": 16167093.0, + "step": 633 + }, + { + "epoch": 0.06962442345706128, + "grad_norm": 2.9535346031188965, + "learning_rate": 3.474204171240395e-07, + "loss": 1.2441, + "mean_token_accuracy": 0.6599279642105103, + "num_tokens": 16187873.0, + "step": 634 + }, + { + "epoch": 0.06973424115967494, + "grad_norm": 2.828838586807251, + "learning_rate": 3.479692645444566e-07, + "loss": 1.0618, + "mean_token_accuracy": 0.6844953298568726, + "num_tokens": 16211058.0, + "step": 635 + }, + { + "epoch": 0.0698440588622886, + "grad_norm": 2.4895882606506348, + "learning_rate": 3.4851811196487374e-07, + "loss": 1.1854, + "mean_token_accuracy": 0.6592599153518677, + "num_tokens": 16236462.0, + "step": 636 + }, + { + "epoch": 0.06995387656490226, + "grad_norm": 2.9574759006500244, + "learning_rate": 3.4906695938529085e-07, + "loss": 1.1321, + "mean_token_accuracy": 0.6711615324020386, + "num_tokens": 16258268.0, + "step": 637 + }, + { + "epoch": 0.07006369426751592, + "grad_norm": 2.3067121505737305, + "learning_rate": 3.49615806805708e-07, + "loss": 1.0598, + "mean_token_accuracy": 0.6911120414733887, + "num_tokens": 16285889.0, + "step": 638 + }, + { + "epoch": 0.07017351197012958, + "grad_norm": 3.024106025695801, + "learning_rate": 3.5016465422612513e-07, + "loss": 1.0462, + 
"mean_token_accuracy": 0.6905795335769653, + "num_tokens": 16305553.0, + "step": 639 + }, + { + "epoch": 0.07028332967274324, + "grad_norm": 3.126769542694092, + "learning_rate": 3.5071350164654225e-07, + "loss": 1.0005, + "mean_token_accuracy": 0.7028473615646362, + "num_tokens": 16325201.0, + "step": 640 + }, + { + "epoch": 0.0703931473753569, + "grad_norm": 2.2513749599456787, + "learning_rate": 3.512623490669594e-07, + "loss": 1.1904, + "mean_token_accuracy": 0.6551594734191895, + "num_tokens": 16354220.0, + "step": 641 + }, + { + "epoch": 0.07050296507797058, + "grad_norm": 2.7467398643493652, + "learning_rate": 3.518111964873765e-07, + "loss": 1.0491, + "mean_token_accuracy": 0.6850873231887817, + "num_tokens": 16375229.0, + "step": 642 + }, + { + "epoch": 0.07061278278058424, + "grad_norm": 2.962629795074463, + "learning_rate": 3.523600439077936e-07, + "loss": 1.0819, + "mean_token_accuracy": 0.6852922439575195, + "num_tokens": 16394352.0, + "step": 643 + }, + { + "epoch": 0.0707226004831979, + "grad_norm": 2.450556993484497, + "learning_rate": 3.529088913282107e-07, + "loss": 1.1293, + "mean_token_accuracy": 0.6690352559089661, + "num_tokens": 16419940.0, + "step": 644 + }, + { + "epoch": 0.07083241818581155, + "grad_norm": 2.9242751598358154, + "learning_rate": 3.534577387486279e-07, + "loss": 1.1204, + "mean_token_accuracy": 0.6754668354988098, + "num_tokens": 16443801.0, + "step": 645 + }, + { + "epoch": 0.07094223588842521, + "grad_norm": 2.3292460441589355, + "learning_rate": 3.54006586169045e-07, + "loss": 1.1804, + "mean_token_accuracy": 0.6557445526123047, + "num_tokens": 16473465.0, + "step": 646 + }, + { + "epoch": 0.07105205359103887, + "grad_norm": 2.3606083393096924, + "learning_rate": 3.545554335894621e-07, + "loss": 1.2047, + "mean_token_accuracy": 0.6591725945472717, + "num_tokens": 16503136.0, + "step": 647 + }, + { + "epoch": 0.07116187129365253, + "grad_norm": 2.980391025543213, + "learning_rate": 3.551042810098792e-07, + "loss": 1.113, + 
"mean_token_accuracy": 0.6763558387756348, + "num_tokens": 16522991.0, + "step": 648 + }, + { + "epoch": 0.0712716889962662, + "grad_norm": 2.486706495285034, + "learning_rate": 3.556531284302964e-07, + "loss": 0.9895, + "mean_token_accuracy": 0.710801899433136, + "num_tokens": 16546935.0, + "step": 649 + }, + { + "epoch": 0.07138150669887985, + "grad_norm": 2.4655234813690186, + "learning_rate": 3.562019758507135e-07, + "loss": 1.0753, + "mean_token_accuracy": 0.6839451789855957, + "num_tokens": 16572135.0, + "step": 650 + }, + { + "epoch": 0.07149132440149351, + "grad_norm": 2.8320837020874023, + "learning_rate": 3.5675082327113057e-07, + "loss": 1.1099, + "mean_token_accuracy": 0.6834249496459961, + "num_tokens": 16592938.0, + "step": 651 + }, + { + "epoch": 0.07160114210410719, + "grad_norm": 2.3677244186401367, + "learning_rate": 3.5729967069154774e-07, + "loss": 1.0513, + "mean_token_accuracy": 0.6884775161743164, + "num_tokens": 16620417.0, + "step": 652 + }, + { + "epoch": 0.07171095980672085, + "grad_norm": 2.3832290172576904, + "learning_rate": 3.5784851811196485e-07, + "loss": 1.0963, + "mean_token_accuracy": 0.6785011291503906, + "num_tokens": 16648892.0, + "step": 653 + }, + { + "epoch": 0.07182077750933451, + "grad_norm": 2.6132125854492188, + "learning_rate": 3.5839736553238197e-07, + "loss": 1.1101, + "mean_token_accuracy": 0.6782544851303101, + "num_tokens": 16673236.0, + "step": 654 + }, + { + "epoch": 0.07193059521194817, + "grad_norm": 2.791177988052368, + "learning_rate": 3.589462129527991e-07, + "loss": 1.0278, + "mean_token_accuracy": 0.6923882961273193, + "num_tokens": 16694979.0, + "step": 655 + }, + { + "epoch": 0.07204041291456183, + "grad_norm": 2.495316743850708, + "learning_rate": 3.5949506037321625e-07, + "loss": 1.0899, + "mean_token_accuracy": 0.682546854019165, + "num_tokens": 16720668.0, + "step": 656 + }, + { + "epoch": 0.07215023061717549, + "grad_norm": 2.658609390258789, + "learning_rate": 3.6004390779363337e-07, + "loss": 
1.1874, + "mean_token_accuracy": 0.6542746424674988, + "num_tokens": 16744308.0, + "step": 657 + }, + { + "epoch": 0.07226004831978915, + "grad_norm": 2.2996089458465576, + "learning_rate": 3.605927552140505e-07, + "loss": 1.1693, + "mean_token_accuracy": 0.6650842428207397, + "num_tokens": 16774168.0, + "step": 658 + }, + { + "epoch": 0.0723698660224028, + "grad_norm": 2.7719504833221436, + "learning_rate": 3.611416026344676e-07, + "loss": 1.1279, + "mean_token_accuracy": 0.6684710383415222, + "num_tokens": 16796521.0, + "step": 659 + }, + { + "epoch": 0.07247968372501647, + "grad_norm": 2.503528356552124, + "learning_rate": 3.6169045005488477e-07, + "loss": 1.1141, + "mean_token_accuracy": 0.6785916090011597, + "num_tokens": 16821077.0, + "step": 660 + }, + { + "epoch": 0.07258950142763014, + "grad_norm": 2.3331520557403564, + "learning_rate": 3.6223929747530183e-07, + "loss": 1.0977, + "mean_token_accuracy": 0.6855172514915466, + "num_tokens": 16851165.0, + "step": 661 + }, + { + "epoch": 0.0726993191302438, + "grad_norm": 2.389235496520996, + "learning_rate": 3.6278814489571894e-07, + "loss": 1.1193, + "mean_token_accuracy": 0.6726590394973755, + "num_tokens": 16878321.0, + "step": 662 + }, + { + "epoch": 0.07280913683285746, + "grad_norm": 2.3532016277313232, + "learning_rate": 3.6333699231613606e-07, + "loss": 1.1465, + "mean_token_accuracy": 0.6636040210723877, + "num_tokens": 16908286.0, + "step": 663 + }, + { + "epoch": 0.07291895453547112, + "grad_norm": 2.4819529056549072, + "learning_rate": 3.6388583973655323e-07, + "loss": 1.1564, + "mean_token_accuracy": 0.6614657640457153, + "num_tokens": 16934773.0, + "step": 664 + }, + { + "epoch": 0.07302877223808478, + "grad_norm": 2.5088415145874023, + "learning_rate": 3.6443468715697034e-07, + "loss": 1.0814, + "mean_token_accuracy": 0.6841531991958618, + "num_tokens": 16962061.0, + "step": 665 + }, + { + "epoch": 0.07313858994069844, + "grad_norm": 2.3436293601989746, + "learning_rate": 3.6498353457738746e-07, 
+ "loss": 1.0924, + "mean_token_accuracy": 0.6790987253189087, + "num_tokens": 16988811.0, + "step": 666 + }, + { + "epoch": 0.0732484076433121, + "grad_norm": 2.513204336166382, + "learning_rate": 3.655323819978046e-07, + "loss": 1.0629, + "mean_token_accuracy": 0.6853093504905701, + "num_tokens": 17012899.0, + "step": 667 + }, + { + "epoch": 0.07335822534592576, + "grad_norm": 2.603746175765991, + "learning_rate": 3.6608122941822174e-07, + "loss": 1.1591, + "mean_token_accuracy": 0.6742569208145142, + "num_tokens": 17037184.0, + "step": 668 + }, + { + "epoch": 0.07346804304853942, + "grad_norm": 2.761530876159668, + "learning_rate": 3.6663007683863886e-07, + "loss": 0.9992, + "mean_token_accuracy": 0.7032953500747681, + "num_tokens": 17057078.0, + "step": 669 + }, + { + "epoch": 0.07357786075115308, + "grad_norm": 2.390747308731079, + "learning_rate": 3.671789242590559e-07, + "loss": 1.1029, + "mean_token_accuracy": 0.678844690322876, + "num_tokens": 17083148.0, + "step": 670 + }, + { + "epoch": 0.07368767845376675, + "grad_norm": 2.2509665489196777, + "learning_rate": 3.677277716794731e-07, + "loss": 1.1073, + "mean_token_accuracy": 0.6812499761581421, + "num_tokens": 17111649.0, + "step": 671 + }, + { + "epoch": 0.07379749615638041, + "grad_norm": 2.4790778160095215, + "learning_rate": 3.682766190998902e-07, + "loss": 1.1311, + "mean_token_accuracy": 0.673592209815979, + "num_tokens": 17137772.0, + "step": 672 + }, + { + "epoch": 0.07390731385899407, + "grad_norm": 2.5734081268310547, + "learning_rate": 3.688254665203073e-07, + "loss": 1.0087, + "mean_token_accuracy": 0.7036850452423096, + "num_tokens": 17161510.0, + "step": 673 + }, + { + "epoch": 0.07401713156160773, + "grad_norm": 2.359140396118164, + "learning_rate": 3.6937431394072443e-07, + "loss": 1.1537, + "mean_token_accuracy": 0.6603296995162964, + "num_tokens": 17188444.0, + "step": 674 + }, + { + "epoch": 0.07412694926422139, + "grad_norm": 2.2967309951782227, + "learning_rate": 
3.699231613611416e-07, + "loss": 1.1358, + "mean_token_accuracy": 0.6678065061569214, + "num_tokens": 17219888.0, + "step": 675 + }, + { + "epoch": 0.07423676696683505, + "grad_norm": 2.730719804763794, + "learning_rate": 3.704720087815587e-07, + "loss": 1.0105, + "mean_token_accuracy": 0.7034118175506592, + "num_tokens": 17240592.0, + "step": 676 + }, + { + "epoch": 0.07434658466944871, + "grad_norm": 2.8933911323547363, + "learning_rate": 3.7102085620197583e-07, + "loss": 1.1483, + "mean_token_accuracy": 0.6599701642990112, + "num_tokens": 17261912.0, + "step": 677 + }, + { + "epoch": 0.07445640237206237, + "grad_norm": 3.2464489936828613, + "learning_rate": 3.71569703622393e-07, + "loss": 1.0957, + "mean_token_accuracy": 0.6771103143692017, + "num_tokens": 17282066.0, + "step": 678 + }, + { + "epoch": 0.07456622007467603, + "grad_norm": 2.442330837249756, + "learning_rate": 3.721185510428101e-07, + "loss": 1.0786, + "mean_token_accuracy": 0.6762453317642212, + "num_tokens": 17307266.0, + "step": 679 + }, + { + "epoch": 0.0746760377772897, + "grad_norm": 2.7152867317199707, + "learning_rate": 3.726673984632272e-07, + "loss": 1.0439, + "mean_token_accuracy": 0.7031011581420898, + "num_tokens": 17329900.0, + "step": 680 + }, + { + "epoch": 0.07478585547990337, + "grad_norm": 2.4962124824523926, + "learning_rate": 3.732162458836443e-07, + "loss": 1.1154, + "mean_token_accuracy": 0.6707748174667358, + "num_tokens": 17355112.0, + "step": 681 + }, + { + "epoch": 0.07489567318251703, + "grad_norm": 2.4082725048065186, + "learning_rate": 3.7376509330406146e-07, + "loss": 1.1909, + "mean_token_accuracy": 0.6531050801277161, + "num_tokens": 17382854.0, + "step": 682 + }, + { + "epoch": 0.07500549088513069, + "grad_norm": 2.5478975772857666, + "learning_rate": 3.743139407244786e-07, + "loss": 1.0721, + "mean_token_accuracy": 0.6837007999420166, + "num_tokens": 17407950.0, + "step": 683 + }, + { + "epoch": 0.07511530858774434, + "grad_norm": 2.440763235092163, + 
"learning_rate": 3.748627881448957e-07, + "loss": 1.1065, + "mean_token_accuracy": 0.6712014675140381, + "num_tokens": 17434589.0, + "step": 684 + }, + { + "epoch": 0.075225126290358, + "grad_norm": 2.575958490371704, + "learning_rate": 3.754116355653128e-07, + "loss": 1.0641, + "mean_token_accuracy": 0.6832867860794067, + "num_tokens": 17458527.0, + "step": 685 + }, + { + "epoch": 0.07533494399297166, + "grad_norm": 2.470611572265625, + "learning_rate": 3.7596048298573e-07, + "loss": 1.2339, + "mean_token_accuracy": 0.6467856764793396, + "num_tokens": 17485636.0, + "step": 686 + }, + { + "epoch": 0.07544476169558532, + "grad_norm": 2.603999376296997, + "learning_rate": 3.765093304061471e-07, + "loss": 1.1416, + "mean_token_accuracy": 0.6729139089584351, + "num_tokens": 17509497.0, + "step": 687 + }, + { + "epoch": 0.07555457939819898, + "grad_norm": 2.2970163822174072, + "learning_rate": 3.770581778265642e-07, + "loss": 1.0183, + "mean_token_accuracy": 0.7002947330474854, + "num_tokens": 17536264.0, + "step": 688 + }, + { + "epoch": 0.07566439710081264, + "grad_norm": 2.5029356479644775, + "learning_rate": 3.7760702524698137e-07, + "loss": 1.0342, + "mean_token_accuracy": 0.6921393871307373, + "num_tokens": 17560430.0, + "step": 689 + }, + { + "epoch": 0.07577421480342632, + "grad_norm": 2.2157480716705322, + "learning_rate": 3.7815587266739844e-07, + "loss": 1.1039, + "mean_token_accuracy": 0.6834001541137695, + "num_tokens": 17589133.0, + "step": 690 + }, + { + "epoch": 0.07588403250603998, + "grad_norm": 2.5103907585144043, + "learning_rate": 3.7870472008781555e-07, + "loss": 1.1347, + "mean_token_accuracy": 0.6701574325561523, + "num_tokens": 17613230.0, + "step": 691 + }, + { + "epoch": 0.07599385020865364, + "grad_norm": 2.5683627128601074, + "learning_rate": 3.7925356750823267e-07, + "loss": 1.0861, + "mean_token_accuracy": 0.6757040023803711, + "num_tokens": 17635123.0, + "step": 692 + }, + { + "epoch": 0.0761036679112673, + "grad_norm": 
2.0800564289093018, + "learning_rate": 3.7980241492864983e-07, + "loss": 1.1975, + "mean_token_accuracy": 0.6537666320800781, + "num_tokens": 17671633.0, + "step": 693 + }, + { + "epoch": 0.07621348561388096, + "grad_norm": 2.3576719760894775, + "learning_rate": 3.8035126234906695e-07, + "loss": 1.083, + "mean_token_accuracy": 0.6845002770423889, + "num_tokens": 17701602.0, + "step": 694 + }, + { + "epoch": 0.07632330331649462, + "grad_norm": 3.129758596420288, + "learning_rate": 3.8090010976948406e-07, + "loss": 0.9808, + "mean_token_accuracy": 0.6991605162620544, + "num_tokens": 17719106.0, + "step": 695 + }, + { + "epoch": 0.07643312101910828, + "grad_norm": 2.4175009727478027, + "learning_rate": 3.814489571899012e-07, + "loss": 1.0998, + "mean_token_accuracy": 0.6754868030548096, + "num_tokens": 17745911.0, + "step": 696 + }, + { + "epoch": 0.07654293872172194, + "grad_norm": 2.5951590538024902, + "learning_rate": 3.8199780461031835e-07, + "loss": 1.1194, + "mean_token_accuracy": 0.6759698390960693, + "num_tokens": 17769168.0, + "step": 697 + }, + { + "epoch": 0.0766527564243356, + "grad_norm": 2.573432207107544, + "learning_rate": 3.8254665203073546e-07, + "loss": 1.1001, + "mean_token_accuracy": 0.6755691766738892, + "num_tokens": 17793050.0, + "step": 698 + }, + { + "epoch": 0.07676257412694927, + "grad_norm": 2.2984654903411865, + "learning_rate": 3.830954994511525e-07, + "loss": 1.1402, + "mean_token_accuracy": 0.6719126105308533, + "num_tokens": 17823624.0, + "step": 699 + }, + { + "epoch": 0.07687239182956293, + "grad_norm": 2.7613449096679688, + "learning_rate": 3.836443468715697e-07, + "loss": 1.115, + "mean_token_accuracy": 0.674934983253479, + "num_tokens": 17844713.0, + "step": 700 + }, + { + "epoch": 0.07698220953217659, + "grad_norm": 2.770494222640991, + "learning_rate": 3.841931942919868e-07, + "loss": 0.9829, + "mean_token_accuracy": 0.7064495086669922, + "num_tokens": 17863754.0, + "step": 701 + }, + { + "epoch": 0.07709202723479025, + 
"grad_norm": 3.04384708404541, + "learning_rate": 3.847420417124039e-07, + "loss": 1.0044, + "mean_token_accuracy": 0.6988327503204346, + "num_tokens": 17880695.0, + "step": 702 + }, + { + "epoch": 0.07720184493740391, + "grad_norm": 2.570000171661377, + "learning_rate": 3.8529088913282104e-07, + "loss": 0.9691, + "mean_token_accuracy": 0.7082796096801758, + "num_tokens": 17902102.0, + "step": 703 + }, + { + "epoch": 0.07731166264001757, + "grad_norm": 3.0129506587982178, + "learning_rate": 3.858397365532382e-07, + "loss": 0.9554, + "mean_token_accuracy": 0.7032852172851562, + "num_tokens": 17919627.0, + "step": 704 + }, + { + "epoch": 0.07742148034263123, + "grad_norm": 2.4123668670654297, + "learning_rate": 3.863885839736553e-07, + "loss": 1.1573, + "mean_token_accuracy": 0.6577290296554565, + "num_tokens": 17947017.0, + "step": 705 + }, + { + "epoch": 0.07753129804524489, + "grad_norm": 2.7207143306732178, + "learning_rate": 3.8693743139407244e-07, + "loss": 1.0194, + "mean_token_accuracy": 0.6989102363586426, + "num_tokens": 17968626.0, + "step": 706 + }, + { + "epoch": 0.07764111574785855, + "grad_norm": 2.3001956939697266, + "learning_rate": 3.8748627881448955e-07, + "loss": 1.1333, + "mean_token_accuracy": 0.668700098991394, + "num_tokens": 17998556.0, + "step": 707 + }, + { + "epoch": 0.07775093345047221, + "grad_norm": 2.5089330673217773, + "learning_rate": 3.880351262349067e-07, + "loss": 1.0259, + "mean_token_accuracy": 0.6960923671722412, + "num_tokens": 18022135.0, + "step": 708 + }, + { + "epoch": 0.07786075115308588, + "grad_norm": 2.6710779666900635, + "learning_rate": 3.885839736553238e-07, + "loss": 1.0225, + "mean_token_accuracy": 0.6916399002075195, + "num_tokens": 18042975.0, + "step": 709 + }, + { + "epoch": 0.07797056885569954, + "grad_norm": 2.5116727352142334, + "learning_rate": 3.891328210757409e-07, + "loss": 1.0534, + "mean_token_accuracy": 0.6851282119750977, + "num_tokens": 18067276.0, + "step": 710 + }, + { + "epoch": 
0.0780803865583132, + "grad_norm": 2.3859620094299316, + "learning_rate": 3.8968166849615807e-07, + "loss": 1.1212, + "mean_token_accuracy": 0.6757639646530151, + "num_tokens": 18096304.0, + "step": 711 + }, + { + "epoch": 0.07819020426092686, + "grad_norm": 2.1829419136047363, + "learning_rate": 3.902305159165752e-07, + "loss": 1.1358, + "mean_token_accuracy": 0.6755061745643616, + "num_tokens": 18127570.0, + "step": 712 + }, + { + "epoch": 0.07830002196354052, + "grad_norm": 2.611192464828491, + "learning_rate": 3.907793633369923e-07, + "loss": 1.0668, + "mean_token_accuracy": 0.6884706616401672, + "num_tokens": 18150016.0, + "step": 713 + }, + { + "epoch": 0.07840983966615418, + "grad_norm": 2.5354154109954834, + "learning_rate": 3.913282107574094e-07, + "loss": 1.06, + "mean_token_accuracy": 0.6824491024017334, + "num_tokens": 18171041.0, + "step": 714 + }, + { + "epoch": 0.07851965736876784, + "grad_norm": 2.508070230484009, + "learning_rate": 3.918770581778266e-07, + "loss": 1.0985, + "mean_token_accuracy": 0.6766155958175659, + "num_tokens": 18195342.0, + "step": 715 + }, + { + "epoch": 0.0786294750713815, + "grad_norm": 2.4151511192321777, + "learning_rate": 3.924259055982437e-07, + "loss": 1.0424, + "mean_token_accuracy": 0.7134076356887817, + "num_tokens": 18222219.0, + "step": 716 + }, + { + "epoch": 0.07873929277399516, + "grad_norm": 2.6129777431488037, + "learning_rate": 3.929747530186608e-07, + "loss": 1.0393, + "mean_token_accuracy": 0.6914509534835815, + "num_tokens": 18245157.0, + "step": 717 + }, + { + "epoch": 0.07884911047660884, + "grad_norm": 2.380469799041748, + "learning_rate": 3.935236004390779e-07, + "loss": 1.0462, + "mean_token_accuracy": 0.690671443939209, + "num_tokens": 18273394.0, + "step": 718 + }, + { + "epoch": 0.0789589281792225, + "grad_norm": 2.221034049987793, + "learning_rate": 3.9407244785949504e-07, + "loss": 1.1719, + "mean_token_accuracy": 0.6800537109375, + "num_tokens": 18304177.0, + "step": 719 + }, + { + "epoch": 
0.07906874588183616, + "grad_norm": 2.413135051727295, + "learning_rate": 3.9462129527991216e-07, + "loss": 1.0551, + "mean_token_accuracy": 0.6832575798034668, + "num_tokens": 18331323.0, + "step": 720 + }, + { + "epoch": 0.07917856358444982, + "grad_norm": 2.3875997066497803, + "learning_rate": 3.9517014270032927e-07, + "loss": 1.0723, + "mean_token_accuracy": 0.682985782623291, + "num_tokens": 18358807.0, + "step": 721 + }, + { + "epoch": 0.07928838128706348, + "grad_norm": 2.666807174682617, + "learning_rate": 3.957189901207464e-07, + "loss": 1.1279, + "mean_token_accuracy": 0.6681783199310303, + "num_tokens": 18381162.0, + "step": 722 + }, + { + "epoch": 0.07939819898967714, + "grad_norm": 2.061647891998291, + "learning_rate": 3.9626783754116356e-07, + "loss": 1.0887, + "mean_token_accuracy": 0.6776692271232605, + "num_tokens": 18412150.0, + "step": 723 + }, + { + "epoch": 0.0795080166922908, + "grad_norm": 2.4166646003723145, + "learning_rate": 3.9681668496158067e-07, + "loss": 1.0648, + "mean_token_accuracy": 0.683131217956543, + "num_tokens": 18437274.0, + "step": 724 + }, + { + "epoch": 0.07961783439490445, + "grad_norm": 2.302172899246216, + "learning_rate": 3.973655323819978e-07, + "loss": 1.0575, + "mean_token_accuracy": 0.7007266283035278, + "num_tokens": 18466113.0, + "step": 725 + }, + { + "epoch": 0.07972765209751811, + "grad_norm": 2.3733558654785156, + "learning_rate": 3.9791437980241495e-07, + "loss": 0.9869, + "mean_token_accuracy": 0.7034314870834351, + "num_tokens": 18491123.0, + "step": 726 + }, + { + "epoch": 0.07983746980013177, + "grad_norm": 2.1647701263427734, + "learning_rate": 3.9846322722283207e-07, + "loss": 1.0526, + "mean_token_accuracy": 0.6907070279121399, + "num_tokens": 18523641.0, + "step": 727 + }, + { + "epoch": 0.07994728750274545, + "grad_norm": 2.336266279220581, + "learning_rate": 3.9901207464324913e-07, + "loss": 1.0305, + "mean_token_accuracy": 0.6989829540252686, + "num_tokens": 18551229.0, + "step": 728 + }, + { + 
"epoch": 0.08005710520535911, + "grad_norm": 2.590445041656494, + "learning_rate": 3.9956092206366625e-07, + "loss": 1.115, + "mean_token_accuracy": 0.6729111671447754, + "num_tokens": 18576042.0, + "step": 729 + }, + { + "epoch": 0.08016692290797277, + "grad_norm": 3.0689311027526855, + "learning_rate": 4.001097694840834e-07, + "loss": 1.1434, + "mean_token_accuracy": 0.6865409016609192, + "num_tokens": 18594387.0, + "step": 730 + }, + { + "epoch": 0.08027674061058643, + "grad_norm": 2.379546880722046, + "learning_rate": 4.0065861690450053e-07, + "loss": 1.0825, + "mean_token_accuracy": 0.6815522909164429, + "num_tokens": 18621051.0, + "step": 731 + }, + { + "epoch": 0.08038655831320009, + "grad_norm": 2.318584442138672, + "learning_rate": 4.0120746432491765e-07, + "loss": 1.2016, + "mean_token_accuracy": 0.6514393091201782, + "num_tokens": 18649786.0, + "step": 732 + }, + { + "epoch": 0.08049637601581375, + "grad_norm": 2.3704302310943604, + "learning_rate": 4.0175631174533476e-07, + "loss": 1.0562, + "mean_token_accuracy": 0.6820507049560547, + "num_tokens": 18679119.0, + "step": 733 + }, + { + "epoch": 0.08060619371842741, + "grad_norm": 2.4759511947631836, + "learning_rate": 4.0230515916575193e-07, + "loss": 1.108, + "mean_token_accuracy": 0.676353394985199, + "num_tokens": 18702104.0, + "step": 734 + }, + { + "epoch": 0.08071601142104107, + "grad_norm": 2.129660129547119, + "learning_rate": 4.0285400658616904e-07, + "loss": 1.1015, + "mean_token_accuracy": 0.6895496845245361, + "num_tokens": 18735005.0, + "step": 735 + }, + { + "epoch": 0.08082582912365473, + "grad_norm": 2.735456943511963, + "learning_rate": 4.0340285400658616e-07, + "loss": 1.078, + "mean_token_accuracy": 0.6819308400154114, + "num_tokens": 18756528.0, + "step": 736 + }, + { + "epoch": 0.0809356468262684, + "grad_norm": 2.609398603439331, + "learning_rate": 4.039517014270033e-07, + "loss": 0.9699, + "mean_token_accuracy": 0.7127725481987, + "num_tokens": 18778435.0, + "step": 737 + }, + { + 
"epoch": 0.08104546452888206, + "grad_norm": 2.729672908782959, + "learning_rate": 4.045005488474204e-07, + "loss": 1.1213, + "mean_token_accuracy": 0.6748394966125488, + "num_tokens": 18799083.0, + "step": 738 + }, + { + "epoch": 0.08115528223149572, + "grad_norm": 2.3961076736450195, + "learning_rate": 4.050493962678375e-07, + "loss": 1.077, + "mean_token_accuracy": 0.6778619289398193, + "num_tokens": 18824279.0, + "step": 739 + }, + { + "epoch": 0.08126509993410938, + "grad_norm": 2.3969674110412598, + "learning_rate": 4.055982436882546e-07, + "loss": 0.9714, + "mean_token_accuracy": 0.7116500735282898, + "num_tokens": 18848448.0, + "step": 740 + }, + { + "epoch": 0.08137491763672304, + "grad_norm": 2.2247605323791504, + "learning_rate": 4.061470911086718e-07, + "loss": 1.0121, + "mean_token_accuracy": 0.6977449655532837, + "num_tokens": 18875596.0, + "step": 741 + }, + { + "epoch": 0.0814847353393367, + "grad_norm": 2.462596893310547, + "learning_rate": 4.066959385290889e-07, + "loss": 1.1199, + "mean_token_accuracy": 0.6713079214096069, + "num_tokens": 18900243.0, + "step": 742 + }, + { + "epoch": 0.08159455304195036, + "grad_norm": 2.2303354740142822, + "learning_rate": 4.07244785949506e-07, + "loss": 1.1242, + "mean_token_accuracy": 0.6745454668998718, + "num_tokens": 18929244.0, + "step": 743 + }, + { + "epoch": 0.08170437074456402, + "grad_norm": 2.2556560039520264, + "learning_rate": 4.0779363336992313e-07, + "loss": 1.0759, + "mean_token_accuracy": 0.6815632581710815, + "num_tokens": 18956751.0, + "step": 744 + }, + { + "epoch": 0.08181418844717768, + "grad_norm": 2.301252841949463, + "learning_rate": 4.083424807903403e-07, + "loss": 1.1124, + "mean_token_accuracy": 0.6731385588645935, + "num_tokens": 18985276.0, + "step": 745 + }, + { + "epoch": 0.08192400614979134, + "grad_norm": 2.254307746887207, + "learning_rate": 4.0889132821075737e-07, + "loss": 0.9684, + "mean_token_accuracy": 0.7164543271064758, + "num_tokens": 19014874.0, + "step": 746 + }, + { 
+ "epoch": 0.08203382385240501, + "grad_norm": 2.1306254863739014, + "learning_rate": 4.094401756311745e-07, + "loss": 1.0128, + "mean_token_accuracy": 0.6969611644744873, + "num_tokens": 19045283.0, + "step": 747 + }, + { + "epoch": 0.08214364155501867, + "grad_norm": 2.2629408836364746, + "learning_rate": 4.0998902305159165e-07, + "loss": 1.0759, + "mean_token_accuracy": 0.6834408044815063, + "num_tokens": 19073722.0, + "step": 748 + }, + { + "epoch": 0.08225345925763233, + "grad_norm": 2.3222053050994873, + "learning_rate": 4.1053787047200876e-07, + "loss": 1.2008, + "mean_token_accuracy": 0.6521885395050049, + "num_tokens": 19102303.0, + "step": 749 + }, + { + "epoch": 0.08236327696024599, + "grad_norm": 2.629955768585205, + "learning_rate": 4.110867178924259e-07, + "loss": 1.1067, + "mean_token_accuracy": 0.6779636144638062, + "num_tokens": 19123914.0, + "step": 750 + }, + { + "epoch": 0.08247309466285965, + "grad_norm": 2.1522481441497803, + "learning_rate": 4.11635565312843e-07, + "loss": 1.0535, + "mean_token_accuracy": 0.6898294687271118, + "num_tokens": 19155265.0, + "step": 751 + }, + { + "epoch": 0.08258291236547331, + "grad_norm": 2.1520047187805176, + "learning_rate": 4.1218441273326016e-07, + "loss": 1.1504, + "mean_token_accuracy": 0.6676845550537109, + "num_tokens": 19186254.0, + "step": 752 + }, + { + "epoch": 0.08269273006808697, + "grad_norm": 2.3935844898223877, + "learning_rate": 4.127332601536773e-07, + "loss": 0.9814, + "mean_token_accuracy": 0.7083364725112915, + "num_tokens": 19210785.0, + "step": 753 + }, + { + "epoch": 0.08280254777070063, + "grad_norm": 2.4602065086364746, + "learning_rate": 4.132821075740944e-07, + "loss": 1.1273, + "mean_token_accuracy": 0.6787582635879517, + "num_tokens": 19238196.0, + "step": 754 + }, + { + "epoch": 0.08291236547331429, + "grad_norm": 2.5685343742370605, + "learning_rate": 4.138309549945115e-07, + "loss": 1.0855, + "mean_token_accuracy": 0.6825853586196899, + "num_tokens": 19262098.0, + "step": 755 
+ }, + { + "epoch": 0.08302218317592797, + "grad_norm": 2.266939401626587, + "learning_rate": 4.143798024149286e-07, + "loss": 1.1043, + "mean_token_accuracy": 0.685926079750061, + "num_tokens": 19288938.0, + "step": 756 + }, + { + "epoch": 0.08313200087854163, + "grad_norm": 2.6340408325195312, + "learning_rate": 4.1492864983534574e-07, + "loss": 1.1086, + "mean_token_accuracy": 0.6724945306777954, + "num_tokens": 19310019.0, + "step": 757 + }, + { + "epoch": 0.08324181858115529, + "grad_norm": 2.635141611099243, + "learning_rate": 4.1547749725576285e-07, + "loss": 1.0286, + "mean_token_accuracy": 0.6948949694633484, + "num_tokens": 19330595.0, + "step": 758 + }, + { + "epoch": 0.08335163628376895, + "grad_norm": 2.2691338062286377, + "learning_rate": 4.1602634467618e-07, + "loss": 1.031, + "mean_token_accuracy": 0.6864601373672485, + "num_tokens": 19356864.0, + "step": 759 + }, + { + "epoch": 0.0834614539863826, + "grad_norm": 2.4898529052734375, + "learning_rate": 4.1657519209659714e-07, + "loss": 0.9706, + "mean_token_accuracy": 0.7081525325775146, + "num_tokens": 19380012.0, + "step": 760 + }, + { + "epoch": 0.08357127168899627, + "grad_norm": 2.4784488677978516, + "learning_rate": 4.1712403951701425e-07, + "loss": 1.0087, + "mean_token_accuracy": 0.7009560465812683, + "num_tokens": 19403630.0, + "step": 761 + }, + { + "epoch": 0.08368108939160993, + "grad_norm": 2.181903839111328, + "learning_rate": 4.1767288693743137e-07, + "loss": 1.0717, + "mean_token_accuracy": 0.6892721056938171, + "num_tokens": 19434864.0, + "step": 762 + }, + { + "epoch": 0.08379090709422359, + "grad_norm": 2.564821243286133, + "learning_rate": 4.1822173435784854e-07, + "loss": 1.0915, + "mean_token_accuracy": 0.6866041421890259, + "num_tokens": 19460744.0, + "step": 763 + }, + { + "epoch": 0.08390072479683724, + "grad_norm": 1.9084213972091675, + "learning_rate": 4.1877058177826565e-07, + "loss": 1.0827, + "mean_token_accuracy": 0.6773413419723511, + "num_tokens": 19496353.0, + 
"step": 764 + }, + { + "epoch": 0.0840105424994509, + "grad_norm": 2.336771011352539, + "learning_rate": 4.193194291986827e-07, + "loss": 1.1068, + "mean_token_accuracy": 0.6765444874763489, + "num_tokens": 19529176.0, + "step": 765 + }, + { + "epoch": 0.08412036020206458, + "grad_norm": 2.47927188873291, + "learning_rate": 4.1986827661909983e-07, + "loss": 1.0546, + "mean_token_accuracy": 0.6843141913414001, + "num_tokens": 19554137.0, + "step": 766 + }, + { + "epoch": 0.08423017790467824, + "grad_norm": 2.281764030456543, + "learning_rate": 4.20417124039517e-07, + "loss": 1.1325, + "mean_token_accuracy": 0.6685166358947754, + "num_tokens": 19581622.0, + "step": 767 + }, + { + "epoch": 0.0843399956072919, + "grad_norm": 2.2170987129211426, + "learning_rate": 4.209659714599341e-07, + "loss": 1.1295, + "mean_token_accuracy": 0.6683709621429443, + "num_tokens": 19611291.0, + "step": 768 + }, + { + "epoch": 0.08444981330990556, + "grad_norm": 2.2353904247283936, + "learning_rate": 4.2151481888035123e-07, + "loss": 1.0929, + "mean_token_accuracy": 0.6793806552886963, + "num_tokens": 19639626.0, + "step": 769 + }, + { + "epoch": 0.08455963101251922, + "grad_norm": 2.491361618041992, + "learning_rate": 4.2206366630076834e-07, + "loss": 1.1977, + "mean_token_accuracy": 0.6566319465637207, + "num_tokens": 19666000.0, + "step": 770 + }, + { + "epoch": 0.08466944871513288, + "grad_norm": 2.2664055824279785, + "learning_rate": 4.226125137211855e-07, + "loss": 1.1645, + "mean_token_accuracy": 0.6706593632698059, + "num_tokens": 19694951.0, + "step": 771 + }, + { + "epoch": 0.08477926641774654, + "grad_norm": 3.0392167568206787, + "learning_rate": 4.231613611416026e-07, + "loss": 1.0191, + "mean_token_accuracy": 0.6942958235740662, + "num_tokens": 19713158.0, + "step": 772 + }, + { + "epoch": 0.0848890841203602, + "grad_norm": 2.135924816131592, + "learning_rate": 4.2371020856201974e-07, + "loss": 1.0651, + "mean_token_accuracy": 0.6867915391921997, + "num_tokens": 19746011.0, 
+ "step": 773 + }, + { + "epoch": 0.08499890182297386, + "grad_norm": 2.5135531425476074, + "learning_rate": 4.242590559824369e-07, + "loss": 1.1977, + "mean_token_accuracy": 0.6562327742576599, + "num_tokens": 19772822.0, + "step": 774 + }, + { + "epoch": 0.08510871952558753, + "grad_norm": 2.2285377979278564, + "learning_rate": 4.2480790340285397e-07, + "loss": 1.1964, + "mean_token_accuracy": 0.6582512855529785, + "num_tokens": 19800893.0, + "step": 775 + }, + { + "epoch": 0.08521853722820119, + "grad_norm": 2.5279152393341064, + "learning_rate": 4.253567508232711e-07, + "loss": 1.1316, + "mean_token_accuracy": 0.6721398830413818, + "num_tokens": 19824409.0, + "step": 776 + }, + { + "epoch": 0.08532835493081485, + "grad_norm": 2.2875406742095947, + "learning_rate": 4.259055982436882e-07, + "loss": 1.179, + "mean_token_accuracy": 0.6612915992736816, + "num_tokens": 19851790.0, + "step": 777 + }, + { + "epoch": 0.08543817263342851, + "grad_norm": 2.337909698486328, + "learning_rate": 4.2645444566410537e-07, + "loss": 1.0921, + "mean_token_accuracy": 0.6883993744850159, + "num_tokens": 19877020.0, + "step": 778 + }, + { + "epoch": 0.08554799033604217, + "grad_norm": 2.202646493911743, + "learning_rate": 4.270032930845225e-07, + "loss": 1.1178, + "mean_token_accuracy": 0.6746418476104736, + "num_tokens": 19906719.0, + "step": 779 + }, + { + "epoch": 0.08565780803865583, + "grad_norm": 2.2627570629119873, + "learning_rate": 4.275521405049396e-07, + "loss": 1.1368, + "mean_token_accuracy": 0.6695615649223328, + "num_tokens": 19935107.0, + "step": 780 + }, + { + "epoch": 0.08576762574126949, + "grad_norm": 2.5313589572906494, + "learning_rate": 4.281009879253567e-07, + "loss": 1.0141, + "mean_token_accuracy": 0.6980851888656616, + "num_tokens": 19958846.0, + "step": 781 + }, + { + "epoch": 0.08587744344388315, + "grad_norm": 2.328267812728882, + "learning_rate": 4.286498353457739e-07, + "loss": 1.1101, + "mean_token_accuracy": 0.6734846830368042, + "num_tokens": 
19984746.0, + "step": 782 + }, + { + "epoch": 0.08598726114649681, + "grad_norm": 2.4663138389587402, + "learning_rate": 4.29198682766191e-07, + "loss": 1.0632, + "mean_token_accuracy": 0.6843123435974121, + "num_tokens": 20011571.0, + "step": 783 + }, + { + "epoch": 0.08609707884911047, + "grad_norm": 2.2176716327667236, + "learning_rate": 4.2974753018660806e-07, + "loss": 1.1461, + "mean_token_accuracy": 0.6740260720252991, + "num_tokens": 20039678.0, + "step": 784 + }, + { + "epoch": 0.08620689655172414, + "grad_norm": 2.731677770614624, + "learning_rate": 4.3029637760702523e-07, + "loss": 1.1576, + "mean_token_accuracy": 0.6658419370651245, + "num_tokens": 20062458.0, + "step": 785 + }, + { + "epoch": 0.0863167142543378, + "grad_norm": 2.6647047996520996, + "learning_rate": 4.3084522502744235e-07, + "loss": 1.0392, + "mean_token_accuracy": 0.6919975876808167, + "num_tokens": 20084436.0, + "step": 786 + }, + { + "epoch": 0.08642653195695146, + "grad_norm": 2.5955183506011963, + "learning_rate": 4.3139407244785946e-07, + "loss": 1.0791, + "mean_token_accuracy": 0.6925190687179565, + "num_tokens": 20107603.0, + "step": 787 + }, + { + "epoch": 0.08653634965956512, + "grad_norm": 2.6091392040252686, + "learning_rate": 4.319429198682766e-07, + "loss": 1.1487, + "mean_token_accuracy": 0.6592194437980652, + "num_tokens": 20131749.0, + "step": 788 + }, + { + "epoch": 0.08664616736217878, + "grad_norm": 2.540388822555542, + "learning_rate": 4.3249176728869374e-07, + "loss": 1.1124, + "mean_token_accuracy": 0.674974799156189, + "num_tokens": 20157369.0, + "step": 789 + }, + { + "epoch": 0.08675598506479244, + "grad_norm": 2.331923246383667, + "learning_rate": 4.3304061470911086e-07, + "loss": 1.0545, + "mean_token_accuracy": 0.6883965134620667, + "num_tokens": 20183423.0, + "step": 790 + }, + { + "epoch": 0.0868658027674061, + "grad_norm": 2.441518783569336, + "learning_rate": 4.33589462129528e-07, + "loss": 1.103, + "mean_token_accuracy": 0.6722874045372009, + 
"num_tokens": 20208588.0, + "step": 791 + }, + { + "epoch": 0.08697562047001976, + "grad_norm": 2.374776601791382, + "learning_rate": 4.341383095499451e-07, + "loss": 1.0836, + "mean_token_accuracy": 0.6809131503105164, + "num_tokens": 20233666.0, + "step": 792 + }, + { + "epoch": 0.08708543817263342, + "grad_norm": 2.6042184829711914, + "learning_rate": 4.3468715697036226e-07, + "loss": 1.0536, + "mean_token_accuracy": 0.6916967034339905, + "num_tokens": 20256628.0, + "step": 793 + }, + { + "epoch": 0.0871952558752471, + "grad_norm": 2.141692638397217, + "learning_rate": 4.352360043907793e-07, + "loss": 1.0382, + "mean_token_accuracy": 0.6893991231918335, + "num_tokens": 20286127.0, + "step": 794 + }, + { + "epoch": 0.08730507357786076, + "grad_norm": 2.100956678390503, + "learning_rate": 4.3578485181119644e-07, + "loss": 1.1055, + "mean_token_accuracy": 0.6672685146331787, + "num_tokens": 20318656.0, + "step": 795 + }, + { + "epoch": 0.08741489128047442, + "grad_norm": 2.442176342010498, + "learning_rate": 4.363336992316136e-07, + "loss": 1.0626, + "mean_token_accuracy": 0.6852571368217468, + "num_tokens": 20341849.0, + "step": 796 + }, + { + "epoch": 0.08752470898308808, + "grad_norm": 2.172957420349121, + "learning_rate": 4.368825466520307e-07, + "loss": 1.1233, + "mean_token_accuracy": 0.6714650392532349, + "num_tokens": 20373454.0, + "step": 797 + }, + { + "epoch": 0.08763452668570174, + "grad_norm": 2.7446584701538086, + "learning_rate": 4.3743139407244783e-07, + "loss": 1.0831, + "mean_token_accuracy": 0.6808466911315918, + "num_tokens": 20395859.0, + "step": 798 + }, + { + "epoch": 0.0877443443883154, + "grad_norm": 2.492501735687256, + "learning_rate": 4.3798024149286495e-07, + "loss": 1.142, + "mean_token_accuracy": 0.6663615703582764, + "num_tokens": 20420523.0, + "step": 799 + }, + { + "epoch": 0.08785416209092906, + "grad_norm": 2.659414291381836, + "learning_rate": 4.385290889132821e-07, + "loss": 1.0645, + "mean_token_accuracy": 0.6869515180587769, 
+ "num_tokens": 20442950.0, + "step": 800 + }, + { + "epoch": 0.08796397979354272, + "grad_norm": 2.0538434982299805, + "learning_rate": 4.3907793633369923e-07, + "loss": 1.0106, + "mean_token_accuracy": 0.6950443983078003, + "num_tokens": 20473684.0, + "step": 801 + }, + { + "epoch": 0.08807379749615638, + "grad_norm": 2.4435315132141113, + "learning_rate": 4.3962678375411635e-07, + "loss": 1.0262, + "mean_token_accuracy": 0.6927362084388733, + "num_tokens": 20498878.0, + "step": 802 + }, + { + "epoch": 0.08818361519877004, + "grad_norm": 2.523648977279663, + "learning_rate": 4.401756311745334e-07, + "loss": 1.094, + "mean_token_accuracy": 0.6698877215385437, + "num_tokens": 20523813.0, + "step": 803 + }, + { + "epoch": 0.08829343290138371, + "grad_norm": 2.4197185039520264, + "learning_rate": 4.407244785949506e-07, + "loss": 1.0186, + "mean_token_accuracy": 0.6906896829605103, + "num_tokens": 20549109.0, + "step": 804 + }, + { + "epoch": 0.08840325060399737, + "grad_norm": 2.731605052947998, + "learning_rate": 4.412733260153677e-07, + "loss": 1.1342, + "mean_token_accuracy": 0.6707794666290283, + "num_tokens": 20572773.0, + "step": 805 + }, + { + "epoch": 0.08851306830661103, + "grad_norm": 2.7125368118286133, + "learning_rate": 4.418221734357848e-07, + "loss": 1.1108, + "mean_token_accuracy": 0.671788215637207, + "num_tokens": 20595427.0, + "step": 806 + }, + { + "epoch": 0.08862288600922469, + "grad_norm": 2.44482159614563, + "learning_rate": 4.42371020856202e-07, + "loss": 1.0391, + "mean_token_accuracy": 0.67911297082901, + "num_tokens": 20619889.0, + "step": 807 + }, + { + "epoch": 0.08873270371183835, + "grad_norm": 2.3583242893218994, + "learning_rate": 4.429198682766191e-07, + "loss": 1.0861, + "mean_token_accuracy": 0.6732698082923889, + "num_tokens": 20647191.0, + "step": 808 + }, + { + "epoch": 0.08884252141445201, + "grad_norm": 2.566173791885376, + "learning_rate": 4.434687156970362e-07, + "loss": 0.9253, + "mean_token_accuracy": 0.716602623462677, + 
"num_tokens": 20669889.0, + "step": 809 + }, + { + "epoch": 0.08895233911706567, + "grad_norm": 2.3785908222198486, + "learning_rate": 4.440175631174533e-07, + "loss": 1.1055, + "mean_token_accuracy": 0.6788649559020996, + "num_tokens": 20696121.0, + "step": 810 + }, + { + "epoch": 0.08906215681967933, + "grad_norm": 2.504185676574707, + "learning_rate": 4.445664105378705e-07, + "loss": 1.1165, + "mean_token_accuracy": 0.6675536036491394, + "num_tokens": 20722307.0, + "step": 811 + }, + { + "epoch": 0.08917197452229299, + "grad_norm": 2.3821816444396973, + "learning_rate": 4.451152579582876e-07, + "loss": 1.0228, + "mean_token_accuracy": 0.7043256759643555, + "num_tokens": 20748929.0, + "step": 812 + }, + { + "epoch": 0.08928179222490666, + "grad_norm": 2.4712159633636475, + "learning_rate": 4.4566410537870467e-07, + "loss": 1.0156, + "mean_token_accuracy": 0.699052095413208, + "num_tokens": 20771552.0, + "step": 813 + }, + { + "epoch": 0.08939160992752032, + "grad_norm": 2.597425937652588, + "learning_rate": 4.462129527991218e-07, + "loss": 1.1455, + "mean_token_accuracy": 0.6734259724617004, + "num_tokens": 20793223.0, + "step": 814 + }, + { + "epoch": 0.08950142763013398, + "grad_norm": 2.401146650314331, + "learning_rate": 4.4676180021953895e-07, + "loss": 1.1467, + "mean_token_accuracy": 0.6638538837432861, + "num_tokens": 20819452.0, + "step": 815 + }, + { + "epoch": 0.08961124533274764, + "grad_norm": 2.5985970497131348, + "learning_rate": 4.4731064763995607e-07, + "loss": 0.9617, + "mean_token_accuracy": 0.7071050405502319, + "num_tokens": 20840869.0, + "step": 816 + }, + { + "epoch": 0.0897210630353613, + "grad_norm": 2.853729009628296, + "learning_rate": 4.478594950603732e-07, + "loss": 0.9837, + "mean_token_accuracy": 0.7055851221084595, + "num_tokens": 20858501.0, + "step": 817 + }, + { + "epoch": 0.08983088073797496, + "grad_norm": 2.4571354389190674, + "learning_rate": 4.4840834248079035e-07, + "loss": 1.04, + "mean_token_accuracy": 
0.6962411999702454, + "num_tokens": 20883524.0, + "step": 818 + }, + { + "epoch": 0.08994069844058862, + "grad_norm": 2.657297372817993, + "learning_rate": 4.4895718990120747e-07, + "loss": 1.0879, + "mean_token_accuracy": 0.6806696653366089, + "num_tokens": 20905206.0, + "step": 819 + }, + { + "epoch": 0.09005051614320228, + "grad_norm": 2.1412813663482666, + "learning_rate": 4.495060373216246e-07, + "loss": 1.109, + "mean_token_accuracy": 0.6740327477455139, + "num_tokens": 20934661.0, + "step": 820 + }, + { + "epoch": 0.09016033384581594, + "grad_norm": 2.516944169998169, + "learning_rate": 4.500548847420417e-07, + "loss": 1.1635, + "mean_token_accuracy": 0.6641197800636292, + "num_tokens": 20959503.0, + "step": 821 + }, + { + "epoch": 0.0902701515484296, + "grad_norm": 2.488072156906128, + "learning_rate": 4.5060373216245886e-07, + "loss": 0.997, + "mean_token_accuracy": 0.6992113590240479, + "num_tokens": 20981865.0, + "step": 822 + }, + { + "epoch": 0.09037996925104327, + "grad_norm": 2.2224531173706055, + "learning_rate": 4.5115257958287593e-07, + "loss": 1.1284, + "mean_token_accuracy": 0.672693133354187, + "num_tokens": 21010929.0, + "step": 823 + }, + { + "epoch": 0.09048978695365693, + "grad_norm": 2.503509759902954, + "learning_rate": 4.5170142700329304e-07, + "loss": 1.0812, + "mean_token_accuracy": 0.680023729801178, + "num_tokens": 21036157.0, + "step": 824 + }, + { + "epoch": 0.0905996046562706, + "grad_norm": 2.3906400203704834, + "learning_rate": 4.5225027442371016e-07, + "loss": 1.1561, + "mean_token_accuracy": 0.6644771695137024, + "num_tokens": 21062843.0, + "step": 825 + }, + { + "epoch": 0.09070942235888425, + "grad_norm": 2.229311466217041, + "learning_rate": 4.527991218441273e-07, + "loss": 1.092, + "mean_token_accuracy": 0.6777641177177429, + "num_tokens": 21090617.0, + "step": 826 + }, + { + "epoch": 0.09081924006149791, + "grad_norm": 2.3913965225219727, + "learning_rate": 4.5334796926454444e-07, + "loss": 1.1353, + 
"mean_token_accuracy": 0.6654020547866821, + "num_tokens": 21114807.0, + "step": 827 + }, + { + "epoch": 0.09092905776411157, + "grad_norm": 2.1957571506500244, + "learning_rate": 4.5389681668496156e-07, + "loss": 1.1885, + "mean_token_accuracy": 0.6614028215408325, + "num_tokens": 21144742.0, + "step": 828 + }, + { + "epoch": 0.09103887546672523, + "grad_norm": 2.599551200866699, + "learning_rate": 4.5444566410537867e-07, + "loss": 1.082, + "mean_token_accuracy": 0.6779563426971436, + "num_tokens": 21167369.0, + "step": 829 + }, + { + "epoch": 0.09114869316933889, + "grad_norm": 2.464461088180542, + "learning_rate": 4.5499451152579584e-07, + "loss": 1.0834, + "mean_token_accuracy": 0.6879395246505737, + "num_tokens": 21190162.0, + "step": 830 + }, + { + "epoch": 0.09125851087195255, + "grad_norm": 2.430582284927368, + "learning_rate": 4.5554335894621295e-07, + "loss": 1.0896, + "mean_token_accuracy": 0.6705799698829651, + "num_tokens": 21214006.0, + "step": 831 + }, + { + "epoch": 0.09136832857456623, + "grad_norm": 2.1939585208892822, + "learning_rate": 4.5609220636663e-07, + "loss": 1.138, + "mean_token_accuracy": 0.6674988269805908, + "num_tokens": 21244401.0, + "step": 832 + }, + { + "epoch": 0.09147814627717989, + "grad_norm": 2.725870370864868, + "learning_rate": 4.566410537870472e-07, + "loss": 1.0554, + "mean_token_accuracy": 0.6841157078742981, + "num_tokens": 21264997.0, + "step": 833 + }, + { + "epoch": 0.09158796397979355, + "grad_norm": 2.50852370262146, + "learning_rate": 4.571899012074643e-07, + "loss": 1.0204, + "mean_token_accuracy": 0.692507266998291, + "num_tokens": 21286021.0, + "step": 834 + }, + { + "epoch": 0.0916977816824072, + "grad_norm": 2.3121020793914795, + "learning_rate": 4.577387486278814e-07, + "loss": 1.0503, + "mean_token_accuracy": 0.6880097985267639, + "num_tokens": 21312713.0, + "step": 835 + }, + { + "epoch": 0.09180759938502087, + "grad_norm": 2.3157474994659424, + "learning_rate": 4.5828759604829853e-07, + "loss": 1.012, + 
"mean_token_accuracy": 0.7024015784263611, + "num_tokens": 21336880.0, + "step": 836 + }, + { + "epoch": 0.09191741708763453, + "grad_norm": 2.680346965789795, + "learning_rate": 4.588364434687157e-07, + "loss": 1.0369, + "mean_token_accuracy": 0.693566083908081, + "num_tokens": 21356570.0, + "step": 837 + }, + { + "epoch": 0.09202723479024819, + "grad_norm": 2.6614346504211426, + "learning_rate": 4.593852908891328e-07, + "loss": 1.126, + "mean_token_accuracy": 0.6675683259963989, + "num_tokens": 21377835.0, + "step": 838 + }, + { + "epoch": 0.09213705249286185, + "grad_norm": 2.494338274002075, + "learning_rate": 4.5993413830954993e-07, + "loss": 1.0336, + "mean_token_accuracy": 0.6987466812133789, + "num_tokens": 21401400.0, + "step": 839 + }, + { + "epoch": 0.0922468701954755, + "grad_norm": 2.220451593399048, + "learning_rate": 4.6048298572996704e-07, + "loss": 1.119, + "mean_token_accuracy": 0.6748887300491333, + "num_tokens": 21431853.0, + "step": 840 + }, + { + "epoch": 0.09235668789808917, + "grad_norm": 3.036910057067871, + "learning_rate": 4.6103183315038416e-07, + "loss": 0.982, + "mean_token_accuracy": 0.7131133079528809, + "num_tokens": 21450719.0, + "step": 841 + }, + { + "epoch": 0.09246650560070284, + "grad_norm": 2.331225633621216, + "learning_rate": 4.615806805708013e-07, + "loss": 1.1038, + "mean_token_accuracy": 0.681930422782898, + "num_tokens": 21477094.0, + "step": 842 + }, + { + "epoch": 0.0925763233033165, + "grad_norm": 2.0639548301696777, + "learning_rate": 4.621295279912184e-07, + "loss": 1.1642, + "mean_token_accuracy": 0.663665771484375, + "num_tokens": 21509266.0, + "step": 843 + }, + { + "epoch": 0.09268614100593016, + "grad_norm": 2.582432270050049, + "learning_rate": 4.6267837541163556e-07, + "loss": 1.0366, + "mean_token_accuracy": 0.6963212490081787, + "num_tokens": 21531535.0, + "step": 844 + }, + { + "epoch": 0.09279595870854382, + "grad_norm": 2.5975563526153564, + "learning_rate": 4.632272228320527e-07, + "loss": 1.0263, + 
"mean_token_accuracy": 0.6982468366622925, + "num_tokens": 21553291.0, + "step": 845 + }, + { + "epoch": 0.09290577641115748, + "grad_norm": 2.1986966133117676, + "learning_rate": 4.637760702524698e-07, + "loss": 1.0372, + "mean_token_accuracy": 0.6986101865768433, + "num_tokens": 21581993.0, + "step": 846 + }, + { + "epoch": 0.09301559411377114, + "grad_norm": 2.625957489013672, + "learning_rate": 4.643249176728869e-07, + "loss": 1.0874, + "mean_token_accuracy": 0.6810996532440186, + "num_tokens": 21603419.0, + "step": 847 + }, + { + "epoch": 0.0931254118163848, + "grad_norm": 2.505964994430542, + "learning_rate": 4.6487376509330407e-07, + "loss": 1.1542, + "mean_token_accuracy": 0.659799337387085, + "num_tokens": 21628388.0, + "step": 848 + }, + { + "epoch": 0.09323522951899846, + "grad_norm": 2.5354058742523193, + "learning_rate": 4.654226125137212e-07, + "loss": 1.0454, + "mean_token_accuracy": 0.68974369764328, + "num_tokens": 21649897.0, + "step": 849 + }, + { + "epoch": 0.09334504722161212, + "grad_norm": 2.8382458686828613, + "learning_rate": 4.659714599341383e-07, + "loss": 1.0182, + "mean_token_accuracy": 0.6882652044296265, + "num_tokens": 21667817.0, + "step": 850 + }, + { + "epoch": 0.09345486492422579, + "grad_norm": 2.269512891769409, + "learning_rate": 4.6652030735455537e-07, + "loss": 1.0713, + "mean_token_accuracy": 0.6838486194610596, + "num_tokens": 21695268.0, + "step": 851 + }, + { + "epoch": 0.09356468262683945, + "grad_norm": 2.2475054264068604, + "learning_rate": 4.6706915477497253e-07, + "loss": 1.0211, + "mean_token_accuracy": 0.694716215133667, + "num_tokens": 21726268.0, + "step": 852 + }, + { + "epoch": 0.09367450032945311, + "grad_norm": 2.374817371368408, + "learning_rate": 4.6761800219538965e-07, + "loss": 1.0967, + "mean_token_accuracy": 0.6733474135398865, + "num_tokens": 21752212.0, + "step": 853 + }, + { + "epoch": 0.09378431803206677, + "grad_norm": 2.5472095012664795, + "learning_rate": 4.6816684961580676e-07, + "loss": 
1.0769, + "mean_token_accuracy": 0.686677098274231, + "num_tokens": 21773790.0, + "step": 854 + }, + { + "epoch": 0.09389413573468043, + "grad_norm": 2.4536962509155273, + "learning_rate": 4.6871569703622393e-07, + "loss": 1.0271, + "mean_token_accuracy": 0.6938968896865845, + "num_tokens": 21796604.0, + "step": 855 + }, + { + "epoch": 0.09400395343729409, + "grad_norm": 2.34744930267334, + "learning_rate": 4.6926454445664105e-07, + "loss": 1.0824, + "mean_token_accuracy": 0.6795163750648499, + "num_tokens": 21821705.0, + "step": 856 + }, + { + "epoch": 0.09411377113990775, + "grad_norm": 2.242234706878662, + "learning_rate": 4.6981339187705816e-07, + "loss": 1.0539, + "mean_token_accuracy": 0.690005898475647, + "num_tokens": 21847939.0, + "step": 857 + }, + { + "epoch": 0.09422358884252141, + "grad_norm": 2.3917508125305176, + "learning_rate": 4.703622392974753e-07, + "loss": 1.0276, + "mean_token_accuracy": 0.690704882144928, + "num_tokens": 21872343.0, + "step": 858 + }, + { + "epoch": 0.09433340654513507, + "grad_norm": 2.3187525272369385, + "learning_rate": 4.7091108671789245e-07, + "loss": 1.0471, + "mean_token_accuracy": 0.6895471811294556, + "num_tokens": 21896731.0, + "step": 859 + }, + { + "epoch": 0.09444322424774873, + "grad_norm": 2.210174798965454, + "learning_rate": 4.714599341383095e-07, + "loss": 1.1004, + "mean_token_accuracy": 0.675218403339386, + "num_tokens": 21924183.0, + "step": 860 + }, + { + "epoch": 0.0945530419503624, + "grad_norm": 2.4128949642181396, + "learning_rate": 4.720087815587266e-07, + "loss": 1.0932, + "mean_token_accuracy": 0.6734756231307983, + "num_tokens": 21947539.0, + "step": 861 + }, + { + "epoch": 0.09466285965297606, + "grad_norm": 2.257687568664551, + "learning_rate": 4.7255762897914374e-07, + "loss": 1.0601, + "mean_token_accuracy": 0.6899541020393372, + "num_tokens": 21974500.0, + "step": 862 + }, + { + "epoch": 0.09477267735558972, + "grad_norm": 2.5571765899658203, + "learning_rate": 4.731064763995609e-07, + 
"loss": 1.0865, + "mean_token_accuracy": 0.6876171231269836, + "num_tokens": 21997519.0, + "step": 863 + }, + { + "epoch": 0.09488249505820338, + "grad_norm": 2.2026169300079346, + "learning_rate": 4.73655323819978e-07, + "loss": 1.027, + "mean_token_accuracy": 0.695209801197052, + "num_tokens": 22025694.0, + "step": 864 + }, + { + "epoch": 0.09499231276081704, + "grad_norm": 1.9952173233032227, + "learning_rate": 4.7420417124039514e-07, + "loss": 1.1221, + "mean_token_accuracy": 0.6661862730979919, + "num_tokens": 22062898.0, + "step": 865 + }, + { + "epoch": 0.0951021304634307, + "grad_norm": 2.569812774658203, + "learning_rate": 4.747530186608123e-07, + "loss": 0.9903, + "mean_token_accuracy": 0.7012958526611328, + "num_tokens": 22084568.0, + "step": 866 + }, + { + "epoch": 0.09521194816604436, + "grad_norm": 2.39729905128479, + "learning_rate": 4.753018660812294e-07, + "loss": 0.9873, + "mean_token_accuracy": 0.6985933780670166, + "num_tokens": 22106669.0, + "step": 867 + }, + { + "epoch": 0.09532176586865802, + "grad_norm": 2.4531805515289307, + "learning_rate": 4.7585071350164654e-07, + "loss": 0.9746, + "mean_token_accuracy": 0.7040457725524902, + "num_tokens": 22131831.0, + "step": 868 + }, + { + "epoch": 0.09543158357127168, + "grad_norm": 2.1182141304016113, + "learning_rate": 4.7639956092206365e-07, + "loss": 1.018, + "mean_token_accuracy": 0.7036113142967224, + "num_tokens": 22160780.0, + "step": 869 + }, + { + "epoch": 0.09554140127388536, + "grad_norm": 2.558516025543213, + "learning_rate": 4.769484083424808e-07, + "loss": 1.1121, + "mean_token_accuracy": 0.6755514740943909, + "num_tokens": 22183729.0, + "step": 870 + }, + { + "epoch": 0.09565121897649902, + "grad_norm": 2.8594632148742676, + "learning_rate": 4.774972557628979e-07, + "loss": 1.0698, + "mean_token_accuracy": 0.6772443652153015, + "num_tokens": 22201309.0, + "step": 871 + }, + { + "epoch": 0.09576103667911268, + "grad_norm": 2.243769645690918, + "learning_rate": 4.78046103183315e-07, + 
"loss": 1.1526, + "mean_token_accuracy": 0.6667280197143555, + "num_tokens": 22229493.0, + "step": 872 + }, + { + "epoch": 0.09587085438172634, + "grad_norm": 2.730502128601074, + "learning_rate": 4.785949506037321e-07, + "loss": 1.0343, + "mean_token_accuracy": 0.692206621170044, + "num_tokens": 22250042.0, + "step": 873 + }, + { + "epoch": 0.09598067208434, + "grad_norm": 2.5726208686828613, + "learning_rate": 4.791437980241493e-07, + "loss": 1.0699, + "mean_token_accuracy": 0.6786489486694336, + "num_tokens": 22272126.0, + "step": 874 + }, + { + "epoch": 0.09609048978695366, + "grad_norm": 2.685795783996582, + "learning_rate": 4.796926454445663e-07, + "loss": 1.0352, + "mean_token_accuracy": 0.690228283405304, + "num_tokens": 22293041.0, + "step": 875 + }, + { + "epoch": 0.09620030748956732, + "grad_norm": 2.5280518531799316, + "learning_rate": 4.802414928649835e-07, + "loss": 1.0573, + "mean_token_accuracy": 0.6906599402427673, + "num_tokens": 22315271.0, + "step": 876 + }, + { + "epoch": 0.09631012519218098, + "grad_norm": 2.245840311050415, + "learning_rate": 4.807903402854007e-07, + "loss": 1.0192, + "mean_token_accuracy": 0.6915353536605835, + "num_tokens": 22342118.0, + "step": 877 + }, + { + "epoch": 0.09641994289479464, + "grad_norm": 2.5079405307769775, + "learning_rate": 4.813391877058177e-07, + "loss": 1.0108, + "mean_token_accuracy": 0.701698899269104, + "num_tokens": 22365505.0, + "step": 878 + }, + { + "epoch": 0.0965297605974083, + "grad_norm": 2.475503921508789, + "learning_rate": 4.818880351262349e-07, + "loss": 1.0559, + "mean_token_accuracy": 0.6806238889694214, + "num_tokens": 22390505.0, + "step": 879 + }, + { + "epoch": 0.09663957830002197, + "grad_norm": 2.5448241233825684, + "learning_rate": 4.82436882546652e-07, + "loss": 1.0806, + "mean_token_accuracy": 0.683830738067627, + "num_tokens": 22412862.0, + "step": 880 + }, + { + "epoch": 0.09674939600263563, + "grad_norm": 2.6540122032165527, + "learning_rate": 4.829857299670691e-07, + 
"loss": 1.0723, + "mean_token_accuracy": 0.6799889802932739, + "num_tokens": 22433232.0, + "step": 881 + }, + { + "epoch": 0.09685921370524929, + "grad_norm": 2.2732763290405273, + "learning_rate": 4.835345773874863e-07, + "loss": 1.0381, + "mean_token_accuracy": 0.6928258538246155, + "num_tokens": 22461670.0, + "step": 882 + }, + { + "epoch": 0.09696903140786295, + "grad_norm": 2.2094473838806152, + "learning_rate": 4.840834248079034e-07, + "loss": 1.0453, + "mean_token_accuracy": 0.690216064453125, + "num_tokens": 22491209.0, + "step": 883 + }, + { + "epoch": 0.09707884911047661, + "grad_norm": 2.652081251144409, + "learning_rate": 4.846322722283204e-07, + "loss": 1.008, + "mean_token_accuracy": 0.707328200340271, + "num_tokens": 22512920.0, + "step": 884 + }, + { + "epoch": 0.09718866681309027, + "grad_norm": 2.351714611053467, + "learning_rate": 4.851811196487376e-07, + "loss": 0.9981, + "mean_token_accuracy": 0.6990324258804321, + "num_tokens": 22538883.0, + "step": 885 + }, + { + "epoch": 0.09729848451570393, + "grad_norm": 2.0844101905822754, + "learning_rate": 4.857299670691548e-07, + "loss": 1.0622, + "mean_token_accuracy": 0.6900792121887207, + "num_tokens": 22569204.0, + "step": 886 + }, + { + "epoch": 0.09740830221831759, + "grad_norm": 2.9231839179992676, + "learning_rate": 4.862788144895718e-07, + "loss": 1.0936, + "mean_token_accuracy": 0.6801368594169617, + "num_tokens": 22589767.0, + "step": 887 + }, + { + "epoch": 0.09751811992093125, + "grad_norm": 2.2764976024627686, + "learning_rate": 4.86827661909989e-07, + "loss": 1.0318, + "mean_token_accuracy": 0.6908450126647949, + "num_tokens": 22617792.0, + "step": 888 + }, + { + "epoch": 0.09762793762354492, + "grad_norm": 2.3881771564483643, + "learning_rate": 4.873765093304062e-07, + "loss": 0.9657, + "mean_token_accuracy": 0.7152634859085083, + "num_tokens": 22643947.0, + "step": 889 + }, + { + "epoch": 0.09773775532615858, + "grad_norm": 2.479783296585083, + "learning_rate": 4.879253567508232e-07, + 
"loss": 0.977, + "mean_token_accuracy": 0.7008132934570312, + "num_tokens": 22667462.0, + "step": 890 + }, + { + "epoch": 0.09784757302877224, + "grad_norm": 2.338757038116455, + "learning_rate": 4.884742041712404e-07, + "loss": 1.0875, + "mean_token_accuracy": 0.6888253688812256, + "num_tokens": 22693625.0, + "step": 891 + }, + { + "epoch": 0.0979573907313859, + "grad_norm": 2.235239028930664, + "learning_rate": 4.890230515916576e-07, + "loss": 1.0931, + "mean_token_accuracy": 0.6783971786499023, + "num_tokens": 22720455.0, + "step": 892 + }, + { + "epoch": 0.09806720843399956, + "grad_norm": 2.213920831680298, + "learning_rate": 4.895718990120746e-07, + "loss": 1.2263, + "mean_token_accuracy": 0.6470769643783569, + "num_tokens": 22751309.0, + "step": 893 + }, + { + "epoch": 0.09817702613661322, + "grad_norm": 2.368777275085449, + "learning_rate": 4.901207464324917e-07, + "loss": 1.023, + "mean_token_accuracy": 0.6964489221572876, + "num_tokens": 22775840.0, + "step": 894 + }, + { + "epoch": 0.09828684383922688, + "grad_norm": 2.7569477558135986, + "learning_rate": 4.906695938529089e-07, + "loss": 1.0677, + "mean_token_accuracy": 0.6841229796409607, + "num_tokens": 22796957.0, + "step": 895 + }, + { + "epoch": 0.09839666154184054, + "grad_norm": 2.4636781215667725, + "learning_rate": 4.91218441273326e-07, + "loss": 0.9267, + "mean_token_accuracy": 0.7232354879379272, + "num_tokens": 22819971.0, + "step": 896 + }, + { + "epoch": 0.0985064792444542, + "grad_norm": 2.2907166481018066, + "learning_rate": 4.917672886937431e-07, + "loss": 1.0566, + "mean_token_accuracy": 0.6842471361160278, + "num_tokens": 22846277.0, + "step": 897 + }, + { + "epoch": 0.09861629694706786, + "grad_norm": 2.4064266681671143, + "learning_rate": 4.923161361141603e-07, + "loss": 1.088, + "mean_token_accuracy": 0.6731777191162109, + "num_tokens": 22870648.0, + "step": 898 + }, + { + "epoch": 0.09872611464968153, + "grad_norm": 2.4706153869628906, + "learning_rate": 4.928649835345773e-07, + 
"loss": 1.1342, + "mean_token_accuracy": 0.6888056397438049, + "num_tokens": 22894323.0, + "step": 899 + }, + { + "epoch": 0.0988359323522952, + "grad_norm": 2.384376049041748, + "learning_rate": 4.934138309549945e-07, + "loss": 1.1351, + "mean_token_accuracy": 0.6657121181488037, + "num_tokens": 22922794.0, + "step": 900 + }, + { + "epoch": 0.09894575005490885, + "grad_norm": 2.626124143600464, + "learning_rate": 4.939626783754117e-07, + "loss": 1.104, + "mean_token_accuracy": 0.6745932102203369, + "num_tokens": 22945255.0, + "step": 901 + }, + { + "epoch": 0.09905556775752251, + "grad_norm": 2.1068923473358154, + "learning_rate": 4.945115257958287e-07, + "loss": 1.1081, + "mean_token_accuracy": 0.6798191666603088, + "num_tokens": 22977385.0, + "step": 902 + }, + { + "epoch": 0.09916538546013617, + "grad_norm": 2.373579978942871, + "learning_rate": 4.950603732162459e-07, + "loss": 0.9554, + "mean_token_accuracy": 0.7133435010910034, + "num_tokens": 23002230.0, + "step": 903 + }, + { + "epoch": 0.09927520316274983, + "grad_norm": 2.5239739418029785, + "learning_rate": 4.95609220636663e-07, + "loss": 1.0593, + "mean_token_accuracy": 0.69190514087677, + "num_tokens": 23025287.0, + "step": 904 + }, + { + "epoch": 0.0993850208653635, + "grad_norm": 2.1697559356689453, + "learning_rate": 4.961580680570801e-07, + "loss": 1.1228, + "mean_token_accuracy": 0.6732170581817627, + "num_tokens": 23055529.0, + "step": 905 + }, + { + "epoch": 0.09949483856797715, + "grad_norm": 2.344438076019287, + "learning_rate": 4.967069154774972e-07, + "loss": 1.0949, + "mean_token_accuracy": 0.6760748028755188, + "num_tokens": 23082912.0, + "step": 906 + }, + { + "epoch": 0.09960465627059081, + "grad_norm": 2.295011281967163, + "learning_rate": 4.972557628979143e-07, + "loss": 1.1059, + "mean_token_accuracy": 0.6760534048080444, + "num_tokens": 23108872.0, + "step": 907 + }, + { + "epoch": 0.09971447397320449, + "grad_norm": 2.2360308170318604, + "learning_rate": 4.978046103183315e-07, + 
"loss": 1.106, + "mean_token_accuracy": 0.6761080622673035, + "num_tokens": 23136510.0, + "step": 908 + }, + { + "epoch": 0.09982429167581815, + "grad_norm": 2.3154690265655518, + "learning_rate": 4.983534577387486e-07, + "loss": 1.0558, + "mean_token_accuracy": 0.6928406357765198, + "num_tokens": 23162635.0, + "step": 909 + }, + { + "epoch": 0.0999341093784318, + "grad_norm": 2.0818591117858887, + "learning_rate": 4.989023051591657e-07, + "loss": 1.1135, + "mean_token_accuracy": 0.6809166669845581, + "num_tokens": 23192544.0, + "step": 910 + }, + { + "epoch": 0.10004392708104547, + "grad_norm": 2.479823112487793, + "learning_rate": 4.994511525795829e-07, + "loss": 1.0657, + "mean_token_accuracy": 0.6796097159385681, + "num_tokens": 23216228.0, + "step": 911 + }, + { + "epoch": 0.10015374478365913, + "grad_norm": 2.1208841800689697, + "learning_rate": 5e-07, + "loss": 1.1509, + "mean_token_accuracy": 0.6665107607841492, + "num_tokens": 23246999.0, + "step": 912 + }, + { + "epoch": 0.10026356248627279, + "grad_norm": 2.6712019443511963, + "learning_rate": 5.005488474204171e-07, + "loss": 1.1067, + "mean_token_accuracy": 0.6782355308532715, + "num_tokens": 23269565.0, + "step": 913 + }, + { + "epoch": 0.10037338018888645, + "grad_norm": 2.53836727142334, + "learning_rate": 5.010976948408342e-07, + "loss": 1.0119, + "mean_token_accuracy": 0.6962478160858154, + "num_tokens": 23292565.0, + "step": 914 + }, + { + "epoch": 0.1004831978915001, + "grad_norm": 2.0749926567077637, + "learning_rate": 5.016465422612514e-07, + "loss": 1.1056, + "mean_token_accuracy": 0.6775331497192383, + "num_tokens": 23324648.0, + "step": 915 + }, + { + "epoch": 0.10059301559411377, + "grad_norm": 2.4445013999938965, + "learning_rate": 5.021953896816685e-07, + "loss": 1.0375, + "mean_token_accuracy": 0.6883593201637268, + "num_tokens": 23349041.0, + "step": 916 + }, + { + "epoch": 0.10070283329672743, + "grad_norm": 2.4497735500335693, + "learning_rate": 5.027442371020856e-07, + "loss": 
1.0854, + "mean_token_accuracy": 0.6835595965385437, + "num_tokens": 23373967.0, + "step": 917 + }, + { + "epoch": 0.1008126509993411, + "grad_norm": 2.491083860397339, + "learning_rate": 5.032930845225028e-07, + "loss": 0.961, + "mean_token_accuracy": 0.710067629814148, + "num_tokens": 23397765.0, + "step": 918 + }, + { + "epoch": 0.10092246870195476, + "grad_norm": 2.2186429500579834, + "learning_rate": 5.038419319429198e-07, + "loss": 1.0683, + "mean_token_accuracy": 0.6878368854522705, + "num_tokens": 23426190.0, + "step": 919 + }, + { + "epoch": 0.10103228640456842, + "grad_norm": 2.1705820560455322, + "learning_rate": 5.04390779363337e-07, + "loss": 1.011, + "mean_token_accuracy": 0.6954101324081421, + "num_tokens": 23454297.0, + "step": 920 + }, + { + "epoch": 0.10114210410718208, + "grad_norm": 2.428990364074707, + "learning_rate": 5.049396267837542e-07, + "loss": 1.0071, + "mean_token_accuracy": 0.7006675601005554, + "num_tokens": 23478733.0, + "step": 921 + }, + { + "epoch": 0.10125192180979574, + "grad_norm": 2.371884346008301, + "learning_rate": 5.054884742041711e-07, + "loss": 1.0768, + "mean_token_accuracy": 0.6779130697250366, + "num_tokens": 23503101.0, + "step": 922 + }, + { + "epoch": 0.1013617395124094, + "grad_norm": 2.3821306228637695, + "learning_rate": 5.060373216245883e-07, + "loss": 1.0542, + "mean_token_accuracy": 0.6863312721252441, + "num_tokens": 23527782.0, + "step": 923 + }, + { + "epoch": 0.10147155721502306, + "grad_norm": 2.0404622554779053, + "learning_rate": 5.065861690450055e-07, + "loss": 1.1445, + "mean_token_accuracy": 0.6646361351013184, + "num_tokens": 23558809.0, + "step": 924 + }, + { + "epoch": 0.10158137491763672, + "grad_norm": 2.3589439392089844, + "learning_rate": 5.071350164654225e-07, + "loss": 1.1466, + "mean_token_accuracy": 0.6623813509941101, + "num_tokens": 23587521.0, + "step": 925 + }, + { + "epoch": 0.10169119262025038, + "grad_norm": 2.9559779167175293, + "learning_rate": 5.076838638858397e-07, + "loss": 
1.003, + "mean_token_accuracy": 0.7049195170402527, + "num_tokens": 23605449.0, + "step": 926 + }, + { + "epoch": 0.10180101032286405, + "grad_norm": 2.5429251194000244, + "learning_rate": 5.082327113062569e-07, + "loss": 1.0119, + "mean_token_accuracy": 0.6938601732254028, + "num_tokens": 23626017.0, + "step": 927 + }, + { + "epoch": 0.10191082802547771, + "grad_norm": 2.2087841033935547, + "learning_rate": 5.087815587266739e-07, + "loss": 1.0327, + "mean_token_accuracy": 0.6883494257926941, + "num_tokens": 23654961.0, + "step": 928 + }, + { + "epoch": 0.10202064572809137, + "grad_norm": 2.5542569160461426, + "learning_rate": 5.093304061470911e-07, + "loss": 1.0716, + "mean_token_accuracy": 0.6783900260925293, + "num_tokens": 23678574.0, + "step": 929 + }, + { + "epoch": 0.10213046343070503, + "grad_norm": 2.5779616832733154, + "learning_rate": 5.098792535675082e-07, + "loss": 1.0655, + "mean_token_accuracy": 0.6907268166542053, + "num_tokens": 23702465.0, + "step": 930 + }, + { + "epoch": 0.10224028113331869, + "grad_norm": 2.3112518787384033, + "learning_rate": 5.104281009879253e-07, + "loss": 1.1201, + "mean_token_accuracy": 0.6701889634132385, + "num_tokens": 23732301.0, + "step": 931 + }, + { + "epoch": 0.10235009883593235, + "grad_norm": 2.566995620727539, + "learning_rate": 5.109769484083425e-07, + "loss": 1.126, + "mean_token_accuracy": 0.6706888675689697, + "num_tokens": 23756818.0, + "step": 932 + }, + { + "epoch": 0.10245991653854601, + "grad_norm": 2.4081950187683105, + "learning_rate": 5.115257958287596e-07, + "loss": 1.0384, + "mean_token_accuracy": 0.69234299659729, + "num_tokens": 23781666.0, + "step": 933 + }, + { + "epoch": 0.10256973424115967, + "grad_norm": 2.4305551052093506, + "learning_rate": 5.120746432491767e-07, + "loss": 1.0767, + "mean_token_accuracy": 0.6872606873512268, + "num_tokens": 23805838.0, + "step": 934 + }, + { + "epoch": 0.10267955194377333, + "grad_norm": 2.3173179626464844, + "learning_rate": 5.126234906695939e-07, + 
"loss": 1.1295, + "mean_token_accuracy": 0.6739983558654785, + "num_tokens": 23832519.0, + "step": 935 + }, + { + "epoch": 0.10278936964638699, + "grad_norm": 2.4574074745178223, + "learning_rate": 5.13172338090011e-07, + "loss": 1.0822, + "mean_token_accuracy": 0.6793834567070007, + "num_tokens": 23857997.0, + "step": 936 + }, + { + "epoch": 0.10289918734900066, + "grad_norm": 2.3350207805633545, + "learning_rate": 5.137211855104281e-07, + "loss": 1.0893, + "mean_token_accuracy": 0.6782150268554688, + "num_tokens": 23885290.0, + "step": 937 + }, + { + "epoch": 0.10300900505161432, + "grad_norm": 2.6374106407165527, + "learning_rate": 5.142700329308453e-07, + "loss": 1.0164, + "mean_token_accuracy": 0.694972038269043, + "num_tokens": 23906488.0, + "step": 938 + }, + { + "epoch": 0.10311882275422798, + "grad_norm": 2.8940916061401367, + "learning_rate": 5.148188803512624e-07, + "loss": 0.9329, + "mean_token_accuracy": 0.7164425253868103, + "num_tokens": 23924882.0, + "step": 939 + }, + { + "epoch": 0.10322864045684164, + "grad_norm": 2.313934326171875, + "learning_rate": 5.153677277716795e-07, + "loss": 1.0361, + "mean_token_accuracy": 0.6940897107124329, + "num_tokens": 23950403.0, + "step": 940 + }, + { + "epoch": 0.1033384581594553, + "grad_norm": 2.680572509765625, + "learning_rate": 5.159165751920965e-07, + "loss": 0.9756, + "mean_token_accuracy": 0.7040901184082031, + "num_tokens": 23972895.0, + "step": 941 + }, + { + "epoch": 0.10344827586206896, + "grad_norm": 2.1764461994171143, + "learning_rate": 5.164654226125136e-07, + "loss": 1.1567, + "mean_token_accuracy": 0.6586360931396484, + "num_tokens": 24004111.0, + "step": 942 + }, + { + "epoch": 0.10355809356468262, + "grad_norm": 2.229003667831421, + "learning_rate": 5.170142700329308e-07, + "loss": 1.1079, + "mean_token_accuracy": 0.6746417284011841, + "num_tokens": 24032498.0, + "step": 943 + }, + { + "epoch": 0.10366791126729628, + "grad_norm": 3.3275129795074463, + "learning_rate": 5.175631174533479e-07, 
+ "loss": 1.0827, + "mean_token_accuracy": 0.6764225959777832, + "num_tokens": 24056860.0, + "step": 944 + }, + { + "epoch": 0.10377772896990994, + "grad_norm": 2.2762413024902344, + "learning_rate": 5.18111964873765e-07, + "loss": 1.0546, + "mean_token_accuracy": 0.6901707649230957, + "num_tokens": 24083768.0, + "step": 945 + }, + { + "epoch": 0.10388754667252362, + "grad_norm": 2.2272369861602783, + "learning_rate": 5.186608122941822e-07, + "loss": 0.9707, + "mean_token_accuracy": 0.7036553025245667, + "num_tokens": 24110809.0, + "step": 946 + }, + { + "epoch": 0.10399736437513728, + "grad_norm": 2.4424173831939697, + "learning_rate": 5.192096597145993e-07, + "loss": 1.0445, + "mean_token_accuracy": 0.68924480676651, + "num_tokens": 24134651.0, + "step": 947 + }, + { + "epoch": 0.10410718207775094, + "grad_norm": 2.6466081142425537, + "learning_rate": 5.197585071350164e-07, + "loss": 0.9977, + "mean_token_accuracy": 0.7009150981903076, + "num_tokens": 24156187.0, + "step": 948 + }, + { + "epoch": 0.1042169997803646, + "grad_norm": 2.494889259338379, + "learning_rate": 5.203073545554336e-07, + "loss": 1.128, + "mean_token_accuracy": 0.6616451144218445, + "num_tokens": 24178528.0, + "step": 949 + }, + { + "epoch": 0.10432681748297826, + "grad_norm": 2.162219524383545, + "learning_rate": 5.208562019758507e-07, + "loss": 1.0974, + "mean_token_accuracy": 0.6729675531387329, + "num_tokens": 24208037.0, + "step": 950 + }, + { + "epoch": 0.10443663518559192, + "grad_norm": 2.2813098430633545, + "learning_rate": 5.214050493962678e-07, + "loss": 1.1132, + "mean_token_accuracy": 0.67399001121521, + "num_tokens": 24234417.0, + "step": 951 + }, + { + "epoch": 0.10454645288820558, + "grad_norm": 2.801614761352539, + "learning_rate": 5.219538968166849e-07, + "loss": 1.0656, + "mean_token_accuracy": 0.6883898377418518, + "num_tokens": 24254723.0, + "step": 952 + }, + { + "epoch": 0.10465627059081924, + "grad_norm": 2.209245443344116, + "learning_rate": 5.225027442371021e-07, + 
"loss": 1.0709, + "mean_token_accuracy": 0.6925941705703735, + "num_tokens": 24281358.0, + "step": 953 + }, + { + "epoch": 0.1047660882934329, + "grad_norm": 2.432748794555664, + "learning_rate": 5.230515916575192e-07, + "loss": 0.8941, + "mean_token_accuracy": 0.7174837589263916, + "num_tokens": 24303063.0, + "step": 954 + }, + { + "epoch": 0.10487590599604656, + "grad_norm": 2.6987380981445312, + "learning_rate": 5.236004390779363e-07, + "loss": 1.001, + "mean_token_accuracy": 0.6984702348709106, + "num_tokens": 24323106.0, + "step": 955 + }, + { + "epoch": 0.10498572369866023, + "grad_norm": 2.4256951808929443, + "learning_rate": 5.241492864983535e-07, + "loss": 1.0679, + "mean_token_accuracy": 0.688035249710083, + "num_tokens": 24348199.0, + "step": 956 + }, + { + "epoch": 0.10509554140127389, + "grad_norm": 2.391448736190796, + "learning_rate": 5.246981339187706e-07, + "loss": 1.0873, + "mean_token_accuracy": 0.6757100224494934, + "num_tokens": 24374872.0, + "step": 957 + }, + { + "epoch": 0.10520535910388755, + "grad_norm": 2.19059681892395, + "learning_rate": 5.252469813391877e-07, + "loss": 1.0353, + "mean_token_accuracy": 0.6893312931060791, + "num_tokens": 24402155.0, + "step": 958 + }, + { + "epoch": 0.10531517680650121, + "grad_norm": 2.443622350692749, + "learning_rate": 5.257958287596049e-07, + "loss": 1.0515, + "mean_token_accuracy": 0.6883721947669983, + "num_tokens": 24425986.0, + "step": 959 + }, + { + "epoch": 0.10542499450911487, + "grad_norm": 2.186133861541748, + "learning_rate": 5.263446761800219e-07, + "loss": 1.1192, + "mean_token_accuracy": 0.6696785092353821, + "num_tokens": 24452703.0, + "step": 960 + }, + { + "epoch": 0.10553481221172853, + "grad_norm": 2.200059413909912, + "learning_rate": 5.26893523600439e-07, + "loss": 1.0007, + "mean_token_accuracy": 0.7006029486656189, + "num_tokens": 24482320.0, + "step": 961 + }, + { + "epoch": 0.10564462991434219, + "grad_norm": 2.3098795413970947, + "learning_rate": 5.274423710208562e-07, + 
"loss": 1.0105, + "mean_token_accuracy": 0.7043132781982422, + "num_tokens": 24509146.0, + "step": 962 + }, + { + "epoch": 0.10575444761695585, + "grad_norm": 2.6013388633728027, + "learning_rate": 5.279912184412732e-07, + "loss": 1.0831, + "mean_token_accuracy": 0.6862649917602539, + "num_tokens": 24533681.0, + "step": 963 + }, + { + "epoch": 0.10586426531956951, + "grad_norm": 2.337937593460083, + "learning_rate": 5.285400658616904e-07, + "loss": 1.0814, + "mean_token_accuracy": 0.6793109178543091, + "num_tokens": 24563436.0, + "step": 964 + }, + { + "epoch": 0.10597408302218318, + "grad_norm": 2.363966464996338, + "learning_rate": 5.290889132821076e-07, + "loss": 1.0642, + "mean_token_accuracy": 0.6937485337257385, + "num_tokens": 24587454.0, + "step": 965 + }, + { + "epoch": 0.10608390072479684, + "grad_norm": 2.412407636642456, + "learning_rate": 5.296377607025246e-07, + "loss": 1.0237, + "mean_token_accuracy": 0.6981116533279419, + "num_tokens": 24609869.0, + "step": 966 + }, + { + "epoch": 0.1061937184274105, + "grad_norm": 2.0326757431030273, + "learning_rate": 5.301866081229418e-07, + "loss": 1.0644, + "mean_token_accuracy": 0.6795060634613037, + "num_tokens": 24640351.0, + "step": 967 + }, + { + "epoch": 0.10630353613002416, + "grad_norm": 2.317695140838623, + "learning_rate": 5.30735455543359e-07, + "loss": 1.0407, + "mean_token_accuracy": 0.6917308568954468, + "num_tokens": 24666447.0, + "step": 968 + }, + { + "epoch": 0.10641335383263782, + "grad_norm": 2.5765788555145264, + "learning_rate": 5.31284302963776e-07, + "loss": 1.027, + "mean_token_accuracy": 0.6945416927337646, + "num_tokens": 24687360.0, + "step": 969 + }, + { + "epoch": 0.10652317153525148, + "grad_norm": 2.3610291481018066, + "learning_rate": 5.318331503841932e-07, + "loss": 0.9346, + "mean_token_accuracy": 0.7202000617980957, + "num_tokens": 24712050.0, + "step": 970 + }, + { + "epoch": 0.10663298923786514, + "grad_norm": 2.2090749740600586, + "learning_rate": 5.323819978046103e-07, + 
"loss": 1.0341, + "mean_token_accuracy": 0.6913720369338989, + "num_tokens": 24741471.0, + "step": 971 + }, + { + "epoch": 0.1067428069404788, + "grad_norm": 2.5974326133728027, + "learning_rate": 5.329308452250274e-07, + "loss": 1.0418, + "mean_token_accuracy": 0.6955652236938477, + "num_tokens": 24763135.0, + "step": 972 + }, + { + "epoch": 0.10685262464309246, + "grad_norm": 2.094987392425537, + "learning_rate": 5.334796926454446e-07, + "loss": 1.0109, + "mean_token_accuracy": 0.70061194896698, + "num_tokens": 24792634.0, + "step": 973 + }, + { + "epoch": 0.10696244234570612, + "grad_norm": 2.1675844192504883, + "learning_rate": 5.340285400658617e-07, + "loss": 1.1036, + "mean_token_accuracy": 0.6816359758377075, + "num_tokens": 24820842.0, + "step": 974 + }, + { + "epoch": 0.1070722600483198, + "grad_norm": 2.5698137283325195, + "learning_rate": 5.345773874862788e-07, + "loss": 0.9955, + "mean_token_accuracy": 0.6940948367118835, + "num_tokens": 24841245.0, + "step": 975 + }, + { + "epoch": 0.10718207775093345, + "grad_norm": 2.5511868000030518, + "learning_rate": 5.35126234906696e-07, + "loss": 0.9666, + "mean_token_accuracy": 0.7048392295837402, + "num_tokens": 24862844.0, + "step": 976 + }, + { + "epoch": 0.10729189545354711, + "grad_norm": 2.193127393722534, + "learning_rate": 5.35675082327113e-07, + "loss": 1.0822, + "mean_token_accuracy": 0.6793603897094727, + "num_tokens": 24891415.0, + "step": 977 + }, + { + "epoch": 0.10740171315616077, + "grad_norm": 2.473113775253296, + "learning_rate": 5.362239297475302e-07, + "loss": 1.104, + "mean_token_accuracy": 0.6708807945251465, + "num_tokens": 24913622.0, + "step": 978 + }, + { + "epoch": 0.10751153085877443, + "grad_norm": 2.3866257667541504, + "learning_rate": 5.367727771679473e-07, + "loss": 1.0298, + "mean_token_accuracy": 0.6955675482749939, + "num_tokens": 24939748.0, + "step": 979 + }, + { + "epoch": 0.1076213485613881, + "grad_norm": 2.4140193462371826, + "learning_rate": 5.373216245883643e-07, + 
"loss": 0.9862, + "mean_token_accuracy": 0.7011673450469971, + "num_tokens": 24963653.0, + "step": 980 + }, + { + "epoch": 0.10773116626400175, + "grad_norm": 2.189281702041626, + "learning_rate": 5.378704720087815e-07, + "loss": 1.1359, + "mean_token_accuracy": 0.6745792627334595, + "num_tokens": 24994054.0, + "step": 981 + }, + { + "epoch": 0.10784098396661541, + "grad_norm": 2.053619861602783, + "learning_rate": 5.384193194291986e-07, + "loss": 1.0609, + "mean_token_accuracy": 0.694031834602356, + "num_tokens": 25023979.0, + "step": 982 + }, + { + "epoch": 0.10795080166922907, + "grad_norm": 2.2912092208862305, + "learning_rate": 5.389681668496157e-07, + "loss": 1.1647, + "mean_token_accuracy": 0.6657364368438721, + "num_tokens": 25053894.0, + "step": 983 + }, + { + "epoch": 0.10806061937184275, + "grad_norm": 2.1282668113708496, + "learning_rate": 5.395170142700329e-07, + "loss": 1.0697, + "mean_token_accuracy": 0.678442120552063, + "num_tokens": 25082810.0, + "step": 984 + }, + { + "epoch": 0.10817043707445641, + "grad_norm": 2.480417251586914, + "learning_rate": 5.4006586169045e-07, + "loss": 1.0355, + "mean_token_accuracy": 0.6968057155609131, + "num_tokens": 25107557.0, + "step": 985 + }, + { + "epoch": 0.10828025477707007, + "grad_norm": 2.5322582721710205, + "learning_rate": 5.406147091108671e-07, + "loss": 1.0727, + "mean_token_accuracy": 0.6811801195144653, + "num_tokens": 25130216.0, + "step": 986 + }, + { + "epoch": 0.10839007247968373, + "grad_norm": 2.5422801971435547, + "learning_rate": 5.411635565312843e-07, + "loss": 1.025, + "mean_token_accuracy": 0.6965072751045227, + "num_tokens": 25151570.0, + "step": 987 + }, + { + "epoch": 0.10849989018229739, + "grad_norm": 2.2153260707855225, + "learning_rate": 5.417124039517014e-07, + "loss": 1.1461, + "mean_token_accuracy": 0.6680699586868286, + "num_tokens": 25181295.0, + "step": 988 + }, + { + "epoch": 0.10860970788491105, + "grad_norm": 2.3333284854888916, + "learning_rate": 5.422612513721185e-07, + 
"loss": 1.0718, + "mean_token_accuracy": 0.6849548816680908, + "num_tokens": 25207948.0, + "step": 989 + }, + { + "epoch": 0.1087195255875247, + "grad_norm": 2.2908833026885986, + "learning_rate": 5.428100987925357e-07, + "loss": 1.0439, + "mean_token_accuracy": 0.6920095682144165, + "num_tokens": 25233913.0, + "step": 990 + }, + { + "epoch": 0.10882934329013837, + "grad_norm": 2.4497244358062744, + "learning_rate": 5.433589462129528e-07, + "loss": 1.0551, + "mean_token_accuracy": 0.7014879584312439, + "num_tokens": 25257310.0, + "step": 991 + }, + { + "epoch": 0.10893916099275203, + "grad_norm": 2.33957839012146, + "learning_rate": 5.439077936333699e-07, + "loss": 1.0045, + "mean_token_accuracy": 0.6973641514778137, + "num_tokens": 25282768.0, + "step": 992 + }, + { + "epoch": 0.10904897869536569, + "grad_norm": 2.7814722061157227, + "learning_rate": 5.44456641053787e-07, + "loss": 1.0099, + "mean_token_accuracy": 0.6977499723434448, + "num_tokens": 25301996.0, + "step": 993 + }, + { + "epoch": 0.10915879639797936, + "grad_norm": 2.0467545986175537, + "learning_rate": 5.450054884742042e-07, + "loss": 1.131, + "mean_token_accuracy": 0.6620352864265442, + "num_tokens": 25334818.0, + "step": 994 + }, + { + "epoch": 0.10926861410059302, + "grad_norm": 2.533472776412964, + "learning_rate": 5.455543358946213e-07, + "loss": 0.9985, + "mean_token_accuracy": 0.7034075260162354, + "num_tokens": 25357164.0, + "step": 995 + }, + { + "epoch": 0.10937843180320668, + "grad_norm": 2.8127124309539795, + "learning_rate": 5.461031833150384e-07, + "loss": 0.9833, + "mean_token_accuracy": 0.704168975353241, + "num_tokens": 25376052.0, + "step": 996 + }, + { + "epoch": 0.10948824950582034, + "grad_norm": 2.433885335922241, + "learning_rate": 5.466520307354556e-07, + "loss": 1.053, + "mean_token_accuracy": 0.6869451403617859, + "num_tokens": 25400357.0, + "step": 997 + }, + { + "epoch": 0.109598067208434, + "grad_norm": 2.4866228103637695, + "learning_rate": 5.472008781558726e-07, + 
"loss": 1.0352, + "mean_token_accuracy": 0.6945279836654663, + "num_tokens": 25422894.0, + "step": 998 + }, + { + "epoch": 0.10970788491104766, + "grad_norm": 2.444207191467285, + "learning_rate": 5.477497255762897e-07, + "loss": 1.1723, + "mean_token_accuracy": 0.6702303886413574, + "num_tokens": 25450195.0, + "step": 999 + }, + { + "epoch": 0.10981770261366132, + "grad_norm": 2.107001543045044, + "learning_rate": 5.482985729967069e-07, + "loss": 0.9855, + "mean_token_accuracy": 0.7020922899246216, + "num_tokens": 25477637.0, + "step": 1000 + }, + { + "epoch": 0.10992752031627498, + "grad_norm": 2.376643180847168, + "learning_rate": 5.48847420417124e-07, + "loss": 1.0435, + "mean_token_accuracy": 0.6851418018341064, + "num_tokens": 25503847.0, + "step": 1001 + }, + { + "epoch": 0.11003733801888864, + "grad_norm": 2.1415696144104004, + "learning_rate": 5.493962678375411e-07, + "loss": 1.1575, + "mean_token_accuracy": 0.6636366844177246, + "num_tokens": 25534079.0, + "step": 1002 + }, + { + "epoch": 0.11014715572150231, + "grad_norm": 2.300351142883301, + "learning_rate": 5.499451152579583e-07, + "loss": 1.0928, + "mean_token_accuracy": 0.6810222864151001, + "num_tokens": 25561784.0, + "step": 1003 + }, + { + "epoch": 0.11025697342411597, + "grad_norm": 2.218785285949707, + "learning_rate": 5.504939626783753e-07, + "loss": 1.0137, + "mean_token_accuracy": 0.6937829256057739, + "num_tokens": 25588812.0, + "step": 1004 + }, + { + "epoch": 0.11036679112672963, + "grad_norm": 2.3855838775634766, + "learning_rate": 5.510428100987925e-07, + "loss": 1.1561, + "mean_token_accuracy": 0.6564309000968933, + "num_tokens": 25614080.0, + "step": 1005 + }, + { + "epoch": 0.11047660882934329, + "grad_norm": 2.766767740249634, + "learning_rate": 5.515916575192097e-07, + "loss": 0.9926, + "mean_token_accuracy": 0.7018936276435852, + "num_tokens": 25632217.0, + "step": 1006 + }, + { + "epoch": 0.11058642653195695, + "grad_norm": 2.3757500648498535, + "learning_rate": 
5.521405049396267e-07, + "loss": 1.0755, + "mean_token_accuracy": 0.6837347745895386, + "num_tokens": 25657597.0, + "step": 1007 + }, + { + "epoch": 0.11069624423457061, + "grad_norm": 2.371436357498169, + "learning_rate": 5.526893523600439e-07, + "loss": 0.966, + "mean_token_accuracy": 0.7073335647583008, + "num_tokens": 25680895.0, + "step": 1008 + }, + { + "epoch": 0.11080606193718427, + "grad_norm": 2.8156237602233887, + "learning_rate": 5.532381997804611e-07, + "loss": 0.9452, + "mean_token_accuracy": 0.7116729617118835, + "num_tokens": 25698588.0, + "step": 1009 + }, + { + "epoch": 0.11091587963979793, + "grad_norm": 2.372734785079956, + "learning_rate": 5.537870472008781e-07, + "loss": 1.0894, + "mean_token_accuracy": 0.6770235300064087, + "num_tokens": 25723460.0, + "step": 1010 + }, + { + "epoch": 0.11102569734241159, + "grad_norm": 2.631155490875244, + "learning_rate": 5.543358946212953e-07, + "loss": 0.9959, + "mean_token_accuracy": 0.7032966613769531, + "num_tokens": 25743647.0, + "step": 1011 + }, + { + "epoch": 0.11113551504502525, + "grad_norm": 2.331270694732666, + "learning_rate": 5.548847420417125e-07, + "loss": 1.0979, + "mean_token_accuracy": 0.6714740991592407, + "num_tokens": 25770311.0, + "step": 1012 + }, + { + "epoch": 0.11124533274763893, + "grad_norm": 2.046708822250366, + "learning_rate": 5.554335894621295e-07, + "loss": 1.0352, + "mean_token_accuracy": 0.6940470933914185, + "num_tokens": 25802358.0, + "step": 1013 + }, + { + "epoch": 0.11135515045025259, + "grad_norm": 2.1885030269622803, + "learning_rate": 5.559824368825467e-07, + "loss": 1.1278, + "mean_token_accuracy": 0.6703910231590271, + "num_tokens": 25831860.0, + "step": 1014 + }, + { + "epoch": 0.11146496815286625, + "grad_norm": 2.248755931854248, + "learning_rate": 5.565312843029637e-07, + "loss": 1.0736, + "mean_token_accuracy": 0.6892768144607544, + "num_tokens": 25858605.0, + "step": 1015 + }, + { + "epoch": 0.1115747858554799, + "grad_norm": 2.6425445079803467, + 
"learning_rate": 5.570801317233809e-07, + "loss": 0.9717, + "mean_token_accuracy": 0.7076042294502258, + "num_tokens": 25877847.0, + "step": 1016 + }, + { + "epoch": 0.11168460355809356, + "grad_norm": 2.323549509048462, + "learning_rate": 5.57628979143798e-07, + "loss": 1.0601, + "mean_token_accuracy": 0.6858867406845093, + "num_tokens": 25904550.0, + "step": 1017 + }, + { + "epoch": 0.11179442126070722, + "grad_norm": 2.392167329788208, + "learning_rate": 5.58177826564215e-07, + "loss": 0.9949, + "mean_token_accuracy": 0.694111168384552, + "num_tokens": 25928322.0, + "step": 1018 + }, + { + "epoch": 0.11190423896332088, + "grad_norm": 2.3708505630493164, + "learning_rate": 5.587266739846322e-07, + "loss": 0.9906, + "mean_token_accuracy": 0.7010446786880493, + "num_tokens": 25952614.0, + "step": 1019 + }, + { + "epoch": 0.11201405666593454, + "grad_norm": 2.1956517696380615, + "learning_rate": 5.592755214050494e-07, + "loss": 1.1249, + "mean_token_accuracy": 0.6676257252693176, + "num_tokens": 25981425.0, + "step": 1020 + }, + { + "epoch": 0.1121238743685482, + "grad_norm": 2.392425775527954, + "learning_rate": 5.598243688254664e-07, + "loss": 1.0809, + "mean_token_accuracy": 0.68547523021698, + "num_tokens": 26008636.0, + "step": 1021 + }, + { + "epoch": 0.11223369207116188, + "grad_norm": 2.7141621112823486, + "learning_rate": 5.603732162458836e-07, + "loss": 1.0349, + "mean_token_accuracy": 0.6910250186920166, + "num_tokens": 26028987.0, + "step": 1022 + }, + { + "epoch": 0.11234350977377554, + "grad_norm": 2.1722748279571533, + "learning_rate": 5.609220636663008e-07, + "loss": 0.9643, + "mean_token_accuracy": 0.7101444005966187, + "num_tokens": 26055616.0, + "step": 1023 + }, + { + "epoch": 0.1124533274763892, + "grad_norm": 2.323166847229004, + "learning_rate": 5.614709110867178e-07, + "loss": 1.0809, + "mean_token_accuracy": 0.676719605922699, + "num_tokens": 26081355.0, + "step": 1024 + }, + { + "epoch": 0.11256314517900286, + "grad_norm": 
2.457380533218384, + "learning_rate": 5.62019758507135e-07, + "loss": 1.0291, + "mean_token_accuracy": 0.6913741827011108, + "num_tokens": 26106379.0, + "step": 1025 + }, + { + "epoch": 0.11267296288161652, + "grad_norm": 2.2688748836517334, + "learning_rate": 5.625686059275521e-07, + "loss": 1.0893, + "mean_token_accuracy": 0.6771997213363647, + "num_tokens": 26133647.0, + "step": 1026 + }, + { + "epoch": 0.11278278058423018, + "grad_norm": 2.425288438796997, + "learning_rate": 5.631174533479692e-07, + "loss": 1.0951, + "mean_token_accuracy": 0.682188868522644, + "num_tokens": 26157508.0, + "step": 1027 + }, + { + "epoch": 0.11289259828684384, + "grad_norm": 2.5006635189056396, + "learning_rate": 5.636663007683864e-07, + "loss": 1.1225, + "mean_token_accuracy": 0.6661000847816467, + "num_tokens": 26180348.0, + "step": 1028 + }, + { + "epoch": 0.1130024159894575, + "grad_norm": 2.320528030395508, + "learning_rate": 5.642151481888035e-07, + "loss": 1.1775, + "mean_token_accuracy": 0.656294047832489, + "num_tokens": 26208838.0, + "step": 1029 + }, + { + "epoch": 0.11311223369207116, + "grad_norm": 2.229346990585327, + "learning_rate": 5.647639956092206e-07, + "loss": 1.0878, + "mean_token_accuracy": 0.6764308214187622, + "num_tokens": 26236130.0, + "step": 1030 + }, + { + "epoch": 0.11322205139468482, + "grad_norm": 2.486908197402954, + "learning_rate": 5.653128430296378e-07, + "loss": 1.0328, + "mean_token_accuracy": 0.6993697285652161, + "num_tokens": 26258668.0, + "step": 1031 + }, + { + "epoch": 0.11333186909729849, + "grad_norm": 2.332984447479248, + "learning_rate": 5.658616904500549e-07, + "loss": 1.0804, + "mean_token_accuracy": 0.6779107451438904, + "num_tokens": 26285712.0, + "step": 1032 + }, + { + "epoch": 0.11344168679991215, + "grad_norm": 2.1597015857696533, + "learning_rate": 5.66410537870472e-07, + "loss": 1.0995, + "mean_token_accuracy": 0.6836291551589966, + "num_tokens": 26315248.0, + "step": 1033 + }, + { + "epoch": 0.11355150450252581, + 
"grad_norm": 2.1456947326660156, + "learning_rate": 5.669593852908892e-07, + "loss": 1.1476, + "mean_token_accuracy": 0.6627589464187622, + "num_tokens": 26344142.0, + "step": 1034 + }, + { + "epoch": 0.11366132220513947, + "grad_norm": 2.390228271484375, + "learning_rate": 5.675082327113063e-07, + "loss": 1.0109, + "mean_token_accuracy": 0.6961274147033691, + "num_tokens": 26367948.0, + "step": 1035 + }, + { + "epoch": 0.11377113990775313, + "grad_norm": 2.678741455078125, + "learning_rate": 5.680570801317233e-07, + "loss": 1.0706, + "mean_token_accuracy": 0.6943035125732422, + "num_tokens": 26388913.0, + "step": 1036 + }, + { + "epoch": 0.11388095761036679, + "grad_norm": 2.337322950363159, + "learning_rate": 5.686059275521404e-07, + "loss": 1.0329, + "mean_token_accuracy": 0.6933689117431641, + "num_tokens": 26414395.0, + "step": 1037 + }, + { + "epoch": 0.11399077531298045, + "grad_norm": 2.5290615558624268, + "learning_rate": 5.691547749725576e-07, + "loss": 1.0362, + "mean_token_accuracy": 0.6924080848693848, + "num_tokens": 26437469.0, + "step": 1038 + }, + { + "epoch": 0.11410059301559411, + "grad_norm": 2.4235308170318604, + "learning_rate": 5.697036223929747e-07, + "loss": 1.1571, + "mean_token_accuracy": 0.6624491810798645, + "num_tokens": 26462992.0, + "step": 1039 + }, + { + "epoch": 0.11421041071820777, + "grad_norm": 2.292872905731201, + "learning_rate": 5.702524698133918e-07, + "loss": 1.0073, + "mean_token_accuracy": 0.7004631757736206, + "num_tokens": 26487919.0, + "step": 1040 + }, + { + "epoch": 0.11432022842082144, + "grad_norm": 2.119293451309204, + "learning_rate": 5.70801317233809e-07, + "loss": 1.1711, + "mean_token_accuracy": 0.6495764851570129, + "num_tokens": 26521241.0, + "step": 1041 + }, + { + "epoch": 0.1144300461234351, + "grad_norm": 2.2553343772888184, + "learning_rate": 5.713501646542261e-07, + "loss": 1.0946, + "mean_token_accuracy": 0.6711209416389465, + "num_tokens": 26549161.0, + "step": 1042 + }, + { + "epoch": 
0.11453986382604876, + "grad_norm": 2.0300028324127197, + "learning_rate": 5.718990120746432e-07, + "loss": 1.0859, + "mean_token_accuracy": 0.6765882968902588, + "num_tokens": 26580769.0, + "step": 1043 + }, + { + "epoch": 0.11464968152866242, + "grad_norm": 2.756662130355835, + "learning_rate": 5.724478594950604e-07, + "loss": 0.9453, + "mean_token_accuracy": 0.7106956243515015, + "num_tokens": 26599253.0, + "step": 1044 + }, + { + "epoch": 0.11475949923127608, + "grad_norm": 2.4854488372802734, + "learning_rate": 5.729967069154775e-07, + "loss": 1.0998, + "mean_token_accuracy": 0.6752268671989441, + "num_tokens": 26623497.0, + "step": 1045 + }, + { + "epoch": 0.11486931693388974, + "grad_norm": 2.3241920471191406, + "learning_rate": 5.735455543358946e-07, + "loss": 0.9989, + "mean_token_accuracy": 0.7004987001419067, + "num_tokens": 26648196.0, + "step": 1046 + }, + { + "epoch": 0.1149791346365034, + "grad_norm": 2.2265586853027344, + "learning_rate": 5.740944017563118e-07, + "loss": 1.0243, + "mean_token_accuracy": 0.6887692213058472, + "num_tokens": 26675356.0, + "step": 1047 + }, + { + "epoch": 0.11508895233911706, + "grad_norm": 2.115950107574463, + "learning_rate": 5.746432491767288e-07, + "loss": 1.1328, + "mean_token_accuracy": 0.6626917123794556, + "num_tokens": 26704588.0, + "step": 1048 + }, + { + "epoch": 0.11519877004173072, + "grad_norm": 2.198216199874878, + "learning_rate": 5.75192096597146e-07, + "loss": 1.0962, + "mean_token_accuracy": 0.6729657053947449, + "num_tokens": 26734112.0, + "step": 1049 + }, + { + "epoch": 0.11530858774434438, + "grad_norm": 2.3398988246917725, + "learning_rate": 5.757409440175632e-07, + "loss": 1.0233, + "mean_token_accuracy": 0.6904132962226868, + "num_tokens": 26760497.0, + "step": 1050 + }, + { + "epoch": 0.11541840544695806, + "grad_norm": 2.35239839553833, + "learning_rate": 5.762897914379802e-07, + "loss": 1.0589, + "mean_token_accuracy": 0.6952887773513794, + "num_tokens": 26786420.0, + "step": 1051 + }, + { + 
"epoch": 0.11552822314957172, + "grad_norm": 2.4576234817504883, + "learning_rate": 5.768386388583974e-07, + "loss": 1.0049, + "mean_token_accuracy": 0.7017413973808289, + "num_tokens": 26811648.0, + "step": 1052 + }, + { + "epoch": 0.11563804085218538, + "grad_norm": 2.4058659076690674, + "learning_rate": 5.773874862788145e-07, + "loss": 1.1561, + "mean_token_accuracy": 0.6665551662445068, + "num_tokens": 26838148.0, + "step": 1053 + }, + { + "epoch": 0.11574785855479904, + "grad_norm": 2.179314136505127, + "learning_rate": 5.779363336992316e-07, + "loss": 1.1106, + "mean_token_accuracy": 0.6633813381195068, + "num_tokens": 26867645.0, + "step": 1054 + }, + { + "epoch": 0.1158576762574127, + "grad_norm": 2.2117278575897217, + "learning_rate": 5.784851811196487e-07, + "loss": 1.1045, + "mean_token_accuracy": 0.6701090931892395, + "num_tokens": 26897331.0, + "step": 1055 + }, + { + "epoch": 0.11596749396002635, + "grad_norm": 1.9570215940475464, + "learning_rate": 5.790340285400658e-07, + "loss": 1.0736, + "mean_token_accuracy": 0.6879056692123413, + "num_tokens": 26931241.0, + "step": 1056 + }, + { + "epoch": 0.11607731166264001, + "grad_norm": 2.5654306411743164, + "learning_rate": 5.795828759604829e-07, + "loss": 1.1828, + "mean_token_accuracy": 0.6565667986869812, + "num_tokens": 26953115.0, + "step": 1057 + }, + { + "epoch": 0.11618712936525367, + "grad_norm": 2.305159091949463, + "learning_rate": 5.801317233809001e-07, + "loss": 1.0952, + "mean_token_accuracy": 0.6932497024536133, + "num_tokens": 26979186.0, + "step": 1058 + }, + { + "epoch": 0.11629694706786733, + "grad_norm": 2.684401273727417, + "learning_rate": 5.806805708013171e-07, + "loss": 1.0511, + "mean_token_accuracy": 0.6881232261657715, + "num_tokens": 27002141.0, + "step": 1059 + }, + { + "epoch": 0.11640676477048101, + "grad_norm": 2.3901026248931885, + "learning_rate": 5.812294182217343e-07, + "loss": 1.0514, + "mean_token_accuracy": 0.6898725032806396, + "num_tokens": 27028251.0, + "step": 
1060 + }, + { + "epoch": 0.11651658247309467, + "grad_norm": 2.3540334701538086, + "learning_rate": 5.817782656421515e-07, + "loss": 1.0629, + "mean_token_accuracy": 0.6872270107269287, + "num_tokens": 27053280.0, + "step": 1061 + }, + { + "epoch": 0.11662640017570833, + "grad_norm": 2.6033778190612793, + "learning_rate": 5.823271130625685e-07, + "loss": 1.0436, + "mean_token_accuracy": 0.6912847757339478, + "num_tokens": 27074797.0, + "step": 1062 + }, + { + "epoch": 0.11673621787832199, + "grad_norm": 2.409771203994751, + "learning_rate": 5.828759604829857e-07, + "loss": 1.0301, + "mean_token_accuracy": 0.6933093667030334, + "num_tokens": 27100047.0, + "step": 1063 + }, + { + "epoch": 0.11684603558093565, + "grad_norm": 2.437502145767212, + "learning_rate": 5.834248079034029e-07, + "loss": 1.0655, + "mean_token_accuracy": 0.6860567331314087, + "num_tokens": 27124683.0, + "step": 1064 + }, + { + "epoch": 0.11695585328354931, + "grad_norm": 2.146801471710205, + "learning_rate": 5.839736553238199e-07, + "loss": 1.1522, + "mean_token_accuracy": 0.6759325265884399, + "num_tokens": 27155623.0, + "step": 1065 + }, + { + "epoch": 0.11706567098616297, + "grad_norm": 2.1619019508361816, + "learning_rate": 5.845225027442371e-07, + "loss": 1.1531, + "mean_token_accuracy": 0.6587933897972107, + "num_tokens": 27185728.0, + "step": 1066 + }, + { + "epoch": 0.11717548868877663, + "grad_norm": 2.620591640472412, + "learning_rate": 5.850713501646543e-07, + "loss": 1.007, + "mean_token_accuracy": 0.6957113146781921, + "num_tokens": 27208167.0, + "step": 1067 + }, + { + "epoch": 0.11728530639139029, + "grad_norm": 2.1338367462158203, + "learning_rate": 5.856201975850713e-07, + "loss": 1.1004, + "mean_token_accuracy": 0.6791257858276367, + "num_tokens": 27237673.0, + "step": 1068 + }, + { + "epoch": 0.11739512409400395, + "grad_norm": 2.0713627338409424, + "learning_rate": 5.861690450054885e-07, + "loss": 1.1603, + "mean_token_accuracy": 0.6577295660972595, + "num_tokens": 
27270020.0, + "step": 1069 + }, + { + "epoch": 0.11750494179661762, + "grad_norm": 2.2609686851501465, + "learning_rate": 5.867178924259056e-07, + "loss": 1.0403, + "mean_token_accuracy": 0.6939022541046143, + "num_tokens": 27296219.0, + "step": 1070 + }, + { + "epoch": 0.11761475949923128, + "grad_norm": 2.5570483207702637, + "learning_rate": 5.872667398463227e-07, + "loss": 1.007, + "mean_token_accuracy": 0.6953150033950806, + "num_tokens": 27316795.0, + "step": 1071 + }, + { + "epoch": 0.11772457720184494, + "grad_norm": 2.157074451446533, + "learning_rate": 5.878155872667399e-07, + "loss": 1.0828, + "mean_token_accuracy": 0.6792920231819153, + "num_tokens": 27345797.0, + "step": 1072 + }, + { + "epoch": 0.1178343949044586, + "grad_norm": 2.6744980812072754, + "learning_rate": 5.88364434687157e-07, + "loss": 1.0462, + "mean_token_accuracy": 0.6893579959869385, + "num_tokens": 27366687.0, + "step": 1073 + }, + { + "epoch": 0.11794421260707226, + "grad_norm": 2.5698344707489014, + "learning_rate": 5.88913282107574e-07, + "loss": 1.1279, + "mean_token_accuracy": 0.6844549179077148, + "num_tokens": 27389819.0, + "step": 1074 + }, + { + "epoch": 0.11805403030968592, + "grad_norm": 2.2385804653167725, + "learning_rate": 5.894621295279912e-07, + "loss": 1.038, + "mean_token_accuracy": 0.6912588477134705, + "num_tokens": 27417647.0, + "step": 1075 + }, + { + "epoch": 0.11816384801229958, + "grad_norm": 2.1603171825408936, + "learning_rate": 5.900109769484083e-07, + "loss": 1.0641, + "mean_token_accuracy": 0.6801882982254028, + "num_tokens": 27445566.0, + "step": 1076 + }, + { + "epoch": 0.11827366571491324, + "grad_norm": 2.359267473220825, + "learning_rate": 5.905598243688254e-07, + "loss": 1.0679, + "mean_token_accuracy": 0.6781899929046631, + "num_tokens": 27471484.0, + "step": 1077 + }, + { + "epoch": 0.1183834834175269, + "grad_norm": 2.60943603515625, + "learning_rate": 5.911086717892426e-07, + "loss": 1.1548, + "mean_token_accuracy": 0.6669136881828308, + 
"num_tokens": 27494898.0, + "step": 1078 + }, + { + "epoch": 0.11849330112014057, + "grad_norm": 2.446368932723999, + "learning_rate": 5.916575192096597e-07, + "loss": 1.1503, + "mean_token_accuracy": 0.6606053113937378, + "num_tokens": 27519771.0, + "step": 1079 + }, + { + "epoch": 0.11860311882275423, + "grad_norm": 2.05033540725708, + "learning_rate": 5.922063666300768e-07, + "loss": 1.0827, + "mean_token_accuracy": 0.6761536002159119, + "num_tokens": 27552251.0, + "step": 1080 + }, + { + "epoch": 0.1187129365253679, + "grad_norm": 2.2880501747131348, + "learning_rate": 5.927552140504939e-07, + "loss": 1.0189, + "mean_token_accuracy": 0.698996365070343, + "num_tokens": 27577387.0, + "step": 1081 + }, + { + "epoch": 0.11882275422798155, + "grad_norm": 2.5927975177764893, + "learning_rate": 5.93304061470911e-07, + "loss": 1.0626, + "mean_token_accuracy": 0.6831701397895813, + "num_tokens": 27600377.0, + "step": 1082 + }, + { + "epoch": 0.11893257193059521, + "grad_norm": 2.4408252239227295, + "learning_rate": 5.938529088913282e-07, + "loss": 1.0639, + "mean_token_accuracy": 0.6877090930938721, + "num_tokens": 27624702.0, + "step": 1083 + }, + { + "epoch": 0.11904238963320887, + "grad_norm": 2.285721778869629, + "learning_rate": 5.944017563117453e-07, + "loss": 1.0964, + "mean_token_accuracy": 0.6777188777923584, + "num_tokens": 27652029.0, + "step": 1084 + }, + { + "epoch": 0.11915220733582253, + "grad_norm": 2.527245044708252, + "learning_rate": 5.949506037321624e-07, + "loss": 1.1389, + "mean_token_accuracy": 0.6733154654502869, + "num_tokens": 27676293.0, + "step": 1085 + }, + { + "epoch": 0.11926202503843619, + "grad_norm": 2.4449424743652344, + "learning_rate": 5.954994511525796e-07, + "loss": 1.0684, + "mean_token_accuracy": 0.6943759918212891, + "num_tokens": 27700370.0, + "step": 1086 + }, + { + "epoch": 0.11937184274104985, + "grad_norm": 2.2381691932678223, + "learning_rate": 5.960482985729967e-07, + "loss": 1.1314, + "mean_token_accuracy": 
0.6640946865081787, + "num_tokens": 27727690.0, + "step": 1087 + }, + { + "epoch": 0.11948166044366351, + "grad_norm": 2.3169941902160645, + "learning_rate": 5.965971459934138e-07, + "loss": 1.0888, + "mean_token_accuracy": 0.674336314201355, + "num_tokens": 27754248.0, + "step": 1088 + }, + { + "epoch": 0.11959147814627719, + "grad_norm": 2.6255948543548584, + "learning_rate": 5.971459934138309e-07, + "loss": 0.985, + "mean_token_accuracy": 0.7016685009002686, + "num_tokens": 27775925.0, + "step": 1089 + }, + { + "epoch": 0.11970129584889085, + "grad_norm": 2.2130613327026367, + "learning_rate": 5.976948408342481e-07, + "loss": 1.0845, + "mean_token_accuracy": 0.6774738430976868, + "num_tokens": 27805079.0, + "step": 1090 + }, + { + "epoch": 0.1198111135515045, + "grad_norm": 2.519319534301758, + "learning_rate": 5.982436882546652e-07, + "loss": 0.9929, + "mean_token_accuracy": 0.7000747919082642, + "num_tokens": 27827954.0, + "step": 1091 + }, + { + "epoch": 0.11992093125411817, + "grad_norm": 2.296020984649658, + "learning_rate": 5.987925356750822e-07, + "loss": 1.1266, + "mean_token_accuracy": 0.6678847074508667, + "num_tokens": 27857114.0, + "step": 1092 + }, + { + "epoch": 0.12003074895673183, + "grad_norm": 2.339991331100464, + "learning_rate": 5.993413830954994e-07, + "loss": 1.0238, + "mean_token_accuracy": 0.697765588760376, + "num_tokens": 27883477.0, + "step": 1093 + }, + { + "epoch": 0.12014056665934549, + "grad_norm": 2.3065428733825684, + "learning_rate": 5.998902305159165e-07, + "loss": 1.0631, + "mean_token_accuracy": 0.6921088695526123, + "num_tokens": 27907035.0, + "step": 1094 + }, + { + "epoch": 0.12025038436195915, + "grad_norm": 2.181490421295166, + "learning_rate": 6.004390779363336e-07, + "loss": 1.0789, + "mean_token_accuracy": 0.6829729080200195, + "num_tokens": 27937253.0, + "step": 1095 + }, + { + "epoch": 0.1203602020645728, + "grad_norm": 2.533970832824707, + "learning_rate": 6.009879253567508e-07, + "loss": 1.0435, + 
"mean_token_accuracy": 0.6969186663627625, + "num_tokens": 27961172.0, + "step": 1096 + }, + { + "epoch": 0.12047001976718646, + "grad_norm": 2.397148609161377, + "learning_rate": 6.015367727771679e-07, + "loss": 1.0715, + "mean_token_accuracy": 0.6936707496643066, + "num_tokens": 27986042.0, + "step": 1097 + }, + { + "epoch": 0.12057983746980014, + "grad_norm": 2.539473056793213, + "learning_rate": 6.02085620197585e-07, + "loss": 1.0098, + "mean_token_accuracy": 0.7008217573165894, + "num_tokens": 28008263.0, + "step": 1098 + }, + { + "epoch": 0.1206896551724138, + "grad_norm": 2.3761491775512695, + "learning_rate": 6.026344676180022e-07, + "loss": 1.0846, + "mean_token_accuracy": 0.6856206655502319, + "num_tokens": 28033798.0, + "step": 1099 + }, + { + "epoch": 0.12079947287502746, + "grad_norm": 2.2895359992980957, + "learning_rate": 6.031833150384192e-07, + "loss": 1.0671, + "mean_token_accuracy": 0.6885620355606079, + "num_tokens": 28060034.0, + "step": 1100 + }, + { + "epoch": 0.12090929057764112, + "grad_norm": 2.4105234146118164, + "learning_rate": 6.037321624588364e-07, + "loss": 1.1372, + "mean_token_accuracy": 0.6695619821548462, + "num_tokens": 28085798.0, + "step": 1101 + }, + { + "epoch": 0.12101910828025478, + "grad_norm": 2.304899215698242, + "learning_rate": 6.042810098792536e-07, + "loss": 1.0786, + "mean_token_accuracy": 0.6830928325653076, + "num_tokens": 28112436.0, + "step": 1102 + }, + { + "epoch": 0.12112892598286844, + "grad_norm": 2.7192583084106445, + "learning_rate": 6.048298572996706e-07, + "loss": 0.9988, + "mean_token_accuracy": 0.7012512683868408, + "num_tokens": 28132511.0, + "step": 1103 + }, + { + "epoch": 0.1212387436854821, + "grad_norm": 2.4972078800201416, + "learning_rate": 6.053787047200878e-07, + "loss": 1.0219, + "mean_token_accuracy": 0.6960514783859253, + "num_tokens": 28154831.0, + "step": 1104 + }, + { + "epoch": 0.12134856138809576, + "grad_norm": 2.668452262878418, + "learning_rate": 6.05927552140505e-07, + "loss": 
0.9547, + "mean_token_accuracy": 0.7206794619560242, + "num_tokens": 28174031.0, + "step": 1105 + }, + { + "epoch": 0.12145837909070942, + "grad_norm": 2.346914052963257, + "learning_rate": 6.06476399560922e-07, + "loss": 1.0554, + "mean_token_accuracy": 0.6877864003181458, + "num_tokens": 28198421.0, + "step": 1106 + }, + { + "epoch": 0.12156819679332308, + "grad_norm": 2.6150808334350586, + "learning_rate": 6.070252469813392e-07, + "loss": 1.0552, + "mean_token_accuracy": 0.6800119280815125, + "num_tokens": 28221248.0, + "step": 1107 + }, + { + "epoch": 0.12167801449593675, + "grad_norm": 2.464836835861206, + "learning_rate": 6.075740944017564e-07, + "loss": 1.0942, + "mean_token_accuracy": 0.684337854385376, + "num_tokens": 28246696.0, + "step": 1108 + }, + { + "epoch": 0.12178783219855041, + "grad_norm": 2.3194143772125244, + "learning_rate": 6.081229418221734e-07, + "loss": 1.0526, + "mean_token_accuracy": 0.6851503849029541, + "num_tokens": 28272172.0, + "step": 1109 + }, + { + "epoch": 0.12189764990116407, + "grad_norm": 2.49312686920166, + "learning_rate": 6.086717892425906e-07, + "loss": 1.0736, + "mean_token_accuracy": 0.6845158934593201, + "num_tokens": 28298527.0, + "step": 1110 + }, + { + "epoch": 0.12200746760377773, + "grad_norm": 2.4180471897125244, + "learning_rate": 6.092206366630076e-07, + "loss": 0.9983, + "mean_token_accuracy": 0.7099671363830566, + "num_tokens": 28320196.0, + "step": 1111 + }, + { + "epoch": 0.12211728530639139, + "grad_norm": 2.3975136280059814, + "learning_rate": 6.097694840834247e-07, + "loss": 1.0123, + "mean_token_accuracy": 0.6922063827514648, + "num_tokens": 28343402.0, + "step": 1112 + }, + { + "epoch": 0.12222710300900505, + "grad_norm": 2.244877815246582, + "learning_rate": 6.103183315038419e-07, + "loss": 0.9781, + "mean_token_accuracy": 0.7123417258262634, + "num_tokens": 28368062.0, + "step": 1113 + }, + { + "epoch": 0.12233692071161871, + "grad_norm": 2.3734002113342285, + "learning_rate": 6.10867178924259e-07, + 
"loss": 1.1012, + "mean_token_accuracy": 0.6756907105445862, + "num_tokens": 28394290.0, + "step": 1114 + }, + { + "epoch": 0.12244673841423237, + "grad_norm": 2.554344892501831, + "learning_rate": 6.114160263446761e-07, + "loss": 0.977, + "mean_token_accuracy": 0.7066525816917419, + "num_tokens": 28415364.0, + "step": 1115 + }, + { + "epoch": 0.12255655611684603, + "grad_norm": 2.2314071655273438, + "learning_rate": 6.119648737650933e-07, + "loss": 1.0278, + "mean_token_accuracy": 0.6964263319969177, + "num_tokens": 28441964.0, + "step": 1116 + }, + { + "epoch": 0.1226663738194597, + "grad_norm": 2.4381747245788574, + "learning_rate": 6.125137211855103e-07, + "loss": 1.0034, + "mean_token_accuracy": 0.6925556659698486, + "num_tokens": 28465120.0, + "step": 1117 + }, + { + "epoch": 0.12277619152207336, + "grad_norm": 2.6955201625823975, + "learning_rate": 6.130625686059275e-07, + "loss": 1.1209, + "mean_token_accuracy": 0.669373095035553, + "num_tokens": 28486653.0, + "step": 1118 + }, + { + "epoch": 0.12288600922468702, + "grad_norm": 2.252506732940674, + "learning_rate": 6.136114160263447e-07, + "loss": 0.9845, + "mean_token_accuracy": 0.7031023502349854, + "num_tokens": 28511497.0, + "step": 1119 + }, + { + "epoch": 0.12299582692730068, + "grad_norm": 2.239086151123047, + "learning_rate": 6.141602634467617e-07, + "loss": 1.0299, + "mean_token_accuracy": 0.6933830976486206, + "num_tokens": 28537913.0, + "step": 1120 + }, + { + "epoch": 0.12310564462991434, + "grad_norm": 2.148280382156372, + "learning_rate": 6.147091108671789e-07, + "loss": 1.0872, + "mean_token_accuracy": 0.6786551475524902, + "num_tokens": 28565736.0, + "step": 1121 + }, + { + "epoch": 0.123215462332528, + "grad_norm": 2.4798600673675537, + "learning_rate": 6.15257958287596e-07, + "loss": 1.0338, + "mean_token_accuracy": 0.6899862289428711, + "num_tokens": 28589319.0, + "step": 1122 + }, + { + "epoch": 0.12332528003514166, + "grad_norm": 2.5925536155700684, + "learning_rate": 
6.158068057080131e-07, + "loss": 1.0559, + "mean_token_accuracy": 0.6782178282737732, + "num_tokens": 28612260.0, + "step": 1123 + }, + { + "epoch": 0.12343509773775532, + "grad_norm": 2.373661518096924, + "learning_rate": 6.163556531284303e-07, + "loss": 1.0376, + "mean_token_accuracy": 0.6846655607223511, + "num_tokens": 28636998.0, + "step": 1124 + }, + { + "epoch": 0.12354491544036898, + "grad_norm": 2.2714662551879883, + "learning_rate": 6.169045005488474e-07, + "loss": 1.0498, + "mean_token_accuracy": 0.6861590147018433, + "num_tokens": 28664289.0, + "step": 1125 + }, + { + "epoch": 0.12365473314298264, + "grad_norm": 2.3724069595336914, + "learning_rate": 6.174533479692645e-07, + "loss": 0.9962, + "mean_token_accuracy": 0.7045583724975586, + "num_tokens": 28687341.0, + "step": 1126 + }, + { + "epoch": 0.12376455084559632, + "grad_norm": 2.2883973121643066, + "learning_rate": 6.180021953896817e-07, + "loss": 1.0674, + "mean_token_accuracy": 0.6819264888763428, + "num_tokens": 28713783.0, + "step": 1127 + }, + { + "epoch": 0.12387436854820998, + "grad_norm": 2.12335467338562, + "learning_rate": 6.185510428100988e-07, + "loss": 1.1365, + "mean_token_accuracy": 0.6604010462760925, + "num_tokens": 28743620.0, + "step": 1128 + }, + { + "epoch": 0.12398418625082364, + "grad_norm": 2.1482183933258057, + "learning_rate": 6.190998902305159e-07, + "loss": 1.0481, + "mean_token_accuracy": 0.6916848421096802, + "num_tokens": 28771592.0, + "step": 1129 + }, + { + "epoch": 0.1240940039534373, + "grad_norm": 2.3576483726501465, + "learning_rate": 6.196487376509331e-07, + "loss": 1.022, + "mean_token_accuracy": 0.6937907338142395, + "num_tokens": 28794605.0, + "step": 1130 + }, + { + "epoch": 0.12420382165605096, + "grad_norm": 2.359503984451294, + "learning_rate": 6.201975850713501e-07, + "loss": 1.1222, + "mean_token_accuracy": 0.6658309698104858, + "num_tokens": 28818691.0, + "step": 1131 + }, + { + "epoch": 0.12431363935866462, + "grad_norm": 2.539041757583618, + 
"learning_rate": 6.207464324917672e-07, + "loss": 1.0874, + "mean_token_accuracy": 0.6841621398925781, + "num_tokens": 28843975.0, + "step": 1132 + }, + { + "epoch": 0.12442345706127828, + "grad_norm": 2.378431558609009, + "learning_rate": 6.212952799121843e-07, + "loss": 1.0541, + "mean_token_accuracy": 0.6847426295280457, + "num_tokens": 28867924.0, + "step": 1133 + }, + { + "epoch": 0.12453327476389194, + "grad_norm": 2.4406328201293945, + "learning_rate": 6.218441273326015e-07, + "loss": 1.0649, + "mean_token_accuracy": 0.6890493631362915, + "num_tokens": 28892281.0, + "step": 1134 + }, + { + "epoch": 0.1246430924665056, + "grad_norm": 2.380431652069092, + "learning_rate": 6.223929747530186e-07, + "loss": 1.0723, + "mean_token_accuracy": 0.6807019710540771, + "num_tokens": 28917736.0, + "step": 1135 + }, + { + "epoch": 0.12475291016911927, + "grad_norm": 2.4432907104492188, + "learning_rate": 6.229418221734357e-07, + "loss": 0.9881, + "mean_token_accuracy": 0.6997706890106201, + "num_tokens": 28939527.0, + "step": 1136 + }, + { + "epoch": 0.12486272787173293, + "grad_norm": 2.4839017391204834, + "learning_rate": 6.234906695938529e-07, + "loss": 1.1138, + "mean_token_accuracy": 0.6708190441131592, + "num_tokens": 28963887.0, + "step": 1137 + }, + { + "epoch": 0.12497254557434659, + "grad_norm": 2.5140492916107178, + "learning_rate": 6.2403951701427e-07, + "loss": 1.045, + "mean_token_accuracy": 0.6940056681632996, + "num_tokens": 28987030.0, + "step": 1138 + }, + { + "epoch": 0.12508236327696023, + "grad_norm": 2.2905983924865723, + "learning_rate": 6.245883644346871e-07, + "loss": 1.0029, + "mean_token_accuracy": 0.6979628801345825, + "num_tokens": 29013135.0, + "step": 1139 + }, + { + "epoch": 0.1251921809795739, + "grad_norm": 2.197751760482788, + "learning_rate": 6.251372118551043e-07, + "loss": 1.0304, + "mean_token_accuracy": 0.6947212219238281, + "num_tokens": 29039909.0, + "step": 1140 + }, + { + "epoch": 0.12530199868218758, + "grad_norm": 
2.390179395675659, + "learning_rate": 6.256860592755214e-07, + "loss": 1.1222, + "mean_token_accuracy": 0.672807514667511, + "num_tokens": 29064247.0, + "step": 1141 + }, + { + "epoch": 0.12541181638480123, + "grad_norm": 2.623724937438965, + "learning_rate": 6.262349066959385e-07, + "loss": 1.1078, + "mean_token_accuracy": 0.6784490942955017, + "num_tokens": 29087138.0, + "step": 1142 + }, + { + "epoch": 0.1255216340874149, + "grad_norm": 2.3443593978881836, + "learning_rate": 6.267837541163557e-07, + "loss": 1.0355, + "mean_token_accuracy": 0.6823259592056274, + "num_tokens": 29112459.0, + "step": 1143 + }, + { + "epoch": 0.12563145179002855, + "grad_norm": 2.5772805213928223, + "learning_rate": 6.273326015367727e-07, + "loss": 1.0261, + "mean_token_accuracy": 0.6928291320800781, + "num_tokens": 29133725.0, + "step": 1144 + }, + { + "epoch": 0.12574126949264222, + "grad_norm": 2.57018780708313, + "learning_rate": 6.278814489571899e-07, + "loss": 1.064, + "mean_token_accuracy": 0.6884138584136963, + "num_tokens": 29154891.0, + "step": 1145 + }, + { + "epoch": 0.12585108719525587, + "grad_norm": 2.332170009613037, + "learning_rate": 6.284302963776071e-07, + "loss": 1.1211, + "mean_token_accuracy": 0.6734158992767334, + "num_tokens": 29180310.0, + "step": 1146 + }, + { + "epoch": 0.12596090489786954, + "grad_norm": 2.3779072761535645, + "learning_rate": 6.289791437980241e-07, + "loss": 0.9111, + "mean_token_accuracy": 0.7272445559501648, + "num_tokens": 29207272.0, + "step": 1147 + }, + { + "epoch": 0.1260707226004832, + "grad_norm": 2.4093875885009766, + "learning_rate": 6.295279912184413e-07, + "loss": 1.0841, + "mean_token_accuracy": 0.6770925521850586, + "num_tokens": 29230474.0, + "step": 1148 + }, + { + "epoch": 0.12618054030309686, + "grad_norm": 2.6982040405273438, + "learning_rate": 6.300768386388585e-07, + "loss": 1.0409, + "mean_token_accuracy": 0.690375566482544, + "num_tokens": 29250468.0, + "step": 1149 + }, + { + "epoch": 0.1262903580057105, + 
"grad_norm": 1.8811455965042114, + "learning_rate": 6.306256860592754e-07, + "loss": 1.0039, + "mean_token_accuracy": 0.6978287696838379, + "num_tokens": 29286128.0, + "step": 1150 + }, + { + "epoch": 0.12640017570832418, + "grad_norm": 2.1857900619506836, + "learning_rate": 6.311745334796926e-07, + "loss": 1.1162, + "mean_token_accuracy": 0.6708922982215881, + "num_tokens": 29315810.0, + "step": 1151 + }, + { + "epoch": 0.12650999341093785, + "grad_norm": 2.4261045455932617, + "learning_rate": 6.317233809001098e-07, + "loss": 1.0437, + "mean_token_accuracy": 0.6995351910591125, + "num_tokens": 29340515.0, + "step": 1152 + }, + { + "epoch": 0.1266198111135515, + "grad_norm": 2.230849266052246, + "learning_rate": 6.322722283205268e-07, + "loss": 1.1167, + "mean_token_accuracy": 0.6701124906539917, + "num_tokens": 29368304.0, + "step": 1153 + }, + { + "epoch": 0.12672962881616517, + "grad_norm": 2.8842782974243164, + "learning_rate": 6.32821075740944e-07, + "loss": 0.9694, + "mean_token_accuracy": 0.7025794386863708, + "num_tokens": 29387368.0, + "step": 1154 + }, + { + "epoch": 0.12683944651877882, + "grad_norm": 2.1427526473999023, + "learning_rate": 6.33369923161361e-07, + "loss": 1.0353, + "mean_token_accuracy": 0.6890696883201599, + "num_tokens": 29416182.0, + "step": 1155 + }, + { + "epoch": 0.1269492642213925, + "grad_norm": 2.1091103553771973, + "learning_rate": 6.339187705817782e-07, + "loss": 1.0491, + "mean_token_accuracy": 0.6908798813819885, + "num_tokens": 29444958.0, + "step": 1156 + }, + { + "epoch": 0.12705908192400614, + "grad_norm": 2.245326280593872, + "learning_rate": 6.344676180021954e-07, + "loss": 0.9426, + "mean_token_accuracy": 0.714836835861206, + "num_tokens": 29469889.0, + "step": 1157 + }, + { + "epoch": 0.1271688996266198, + "grad_norm": 2.3917086124420166, + "learning_rate": 6.350164654226124e-07, + "loss": 1.1068, + "mean_token_accuracy": 0.6735954284667969, + "num_tokens": 29494119.0, + "step": 1158 + }, + { + "epoch": 
0.12727871732923346, + "grad_norm": 2.3784945011138916, + "learning_rate": 6.355653128430296e-07, + "loss": 1.0336, + "mean_token_accuracy": 0.6904230117797852, + "num_tokens": 29517989.0, + "step": 1159 + }, + { + "epoch": 0.12738853503184713, + "grad_norm": 2.260906219482422, + "learning_rate": 6.361141602634468e-07, + "loss": 1.026, + "mean_token_accuracy": 0.6917699575424194, + "num_tokens": 29544051.0, + "step": 1160 + }, + { + "epoch": 0.1274983527344608, + "grad_norm": 2.139403820037842, + "learning_rate": 6.366630076838638e-07, + "loss": 0.9829, + "mean_token_accuracy": 0.7020767331123352, + "num_tokens": 29572078.0, + "step": 1161 + }, + { + "epoch": 0.12760817043707445, + "grad_norm": 2.2455174922943115, + "learning_rate": 6.37211855104281e-07, + "loss": 1.0955, + "mean_token_accuracy": 0.6691274046897888, + "num_tokens": 29600075.0, + "step": 1162 + }, + { + "epoch": 0.12771798813968813, + "grad_norm": 2.287766933441162, + "learning_rate": 6.377607025246982e-07, + "loss": 1.0832, + "mean_token_accuracy": 0.6761844754219055, + "num_tokens": 29626121.0, + "step": 1163 + }, + { + "epoch": 0.12782780584230177, + "grad_norm": 2.246778726577759, + "learning_rate": 6.383095499451152e-07, + "loss": 1.0946, + "mean_token_accuracy": 0.6859602928161621, + "num_tokens": 29654870.0, + "step": 1164 + }, + { + "epoch": 0.12793762354491545, + "grad_norm": 2.1714658737182617, + "learning_rate": 6.388583973655324e-07, + "loss": 1.0574, + "mean_token_accuracy": 0.6865646839141846, + "num_tokens": 29684295.0, + "step": 1165 + }, + { + "epoch": 0.1280474412475291, + "grad_norm": 2.340688467025757, + "learning_rate": 6.394072447859495e-07, + "loss": 1.0448, + "mean_token_accuracy": 0.6866459846496582, + "num_tokens": 29709290.0, + "step": 1166 + }, + { + "epoch": 0.12815725895014277, + "grad_norm": 2.1716251373291016, + "learning_rate": 6.399560922063666e-07, + "loss": 0.9625, + "mean_token_accuracy": 0.7105631828308105, + "num_tokens": 29736532.0, + "step": 1167 + }, + { + 
"epoch": 0.1282670766527564, + "grad_norm": 2.339986801147461, + "learning_rate": 6.405049396267838e-07, + "loss": 1.0052, + "mean_token_accuracy": 0.7022390365600586, + "num_tokens": 29762620.0, + "step": 1168 + }, + { + "epoch": 0.12837689435537009, + "grad_norm": 2.4311742782592773, + "learning_rate": 6.410537870472008e-07, + "loss": 1.0322, + "mean_token_accuracy": 0.6918898224830627, + "num_tokens": 29785970.0, + "step": 1169 + }, + { + "epoch": 0.12848671205798376, + "grad_norm": 2.7314531803131104, + "learning_rate": 6.416026344676179e-07, + "loss": 1.0544, + "mean_token_accuracy": 0.6826190948486328, + "num_tokens": 29806203.0, + "step": 1170 + }, + { + "epoch": 0.1285965297605974, + "grad_norm": 2.2546820640563965, + "learning_rate": 6.421514818880351e-07, + "loss": 1.0045, + "mean_token_accuracy": 0.7033295035362244, + "num_tokens": 29834669.0, + "step": 1171 + }, + { + "epoch": 0.12870634746321108, + "grad_norm": 2.495382308959961, + "learning_rate": 6.427003293084522e-07, + "loss": 1.0056, + "mean_token_accuracy": 0.6915538907051086, + "num_tokens": 29856539.0, + "step": 1172 + }, + { + "epoch": 0.12881616516582473, + "grad_norm": 2.206167697906494, + "learning_rate": 6.432491767288693e-07, + "loss": 1.056, + "mean_token_accuracy": 0.6862185001373291, + "num_tokens": 29883102.0, + "step": 1173 + }, + { + "epoch": 0.1289259828684384, + "grad_norm": 2.0318803787231445, + "learning_rate": 6.437980241492865e-07, + "loss": 1.0899, + "mean_token_accuracy": 0.6769280433654785, + "num_tokens": 29914768.0, + "step": 1174 + }, + { + "epoch": 0.12903580057105205, + "grad_norm": 2.5419886112213135, + "learning_rate": 6.443468715697036e-07, + "loss": 1.0616, + "mean_token_accuracy": 0.6929397583007812, + "num_tokens": 29937805.0, + "step": 1175 + }, + { + "epoch": 0.12914561827366572, + "grad_norm": 2.332415819168091, + "learning_rate": 6.448957189901207e-07, + "loss": 1.0518, + "mean_token_accuracy": 0.7018187046051025, + "num_tokens": 29962432.0, + "step": 1176 + 
}, + { + "epoch": 0.12925543597627936, + "grad_norm": 2.6153769493103027, + "learning_rate": 6.454445664105378e-07, + "loss": 1.0719, + "mean_token_accuracy": 0.6917903423309326, + "num_tokens": 29987188.0, + "step": 1177 + }, + { + "epoch": 0.12936525367889304, + "grad_norm": 2.507232904434204, + "learning_rate": 6.45993413830955e-07, + "loss": 1.0307, + "mean_token_accuracy": 0.6994490027427673, + "num_tokens": 30009414.0, + "step": 1178 + }, + { + "epoch": 0.1294750713815067, + "grad_norm": 2.6970374584198, + "learning_rate": 6.465422612513721e-07, + "loss": 1.0401, + "mean_token_accuracy": 0.688176691532135, + "num_tokens": 30030041.0, + "step": 1179 + }, + { + "epoch": 0.12958488908412036, + "grad_norm": 2.5277676582336426, + "learning_rate": 6.470911086717892e-07, + "loss": 1.0798, + "mean_token_accuracy": 0.6820120215415955, + "num_tokens": 30052907.0, + "step": 1180 + }, + { + "epoch": 0.12969470678673403, + "grad_norm": 2.1625816822052, + "learning_rate": 6.476399560922064e-07, + "loss": 1.1486, + "mean_token_accuracy": 0.6645796895027161, + "num_tokens": 30083919.0, + "step": 1181 + }, + { + "epoch": 0.12980452448934768, + "grad_norm": 2.701779365539551, + "learning_rate": 6.481888035126235e-07, + "loss": 1.0134, + "mean_token_accuracy": 0.7021280527114868, + "num_tokens": 30104769.0, + "step": 1182 + }, + { + "epoch": 0.12991434219196135, + "grad_norm": 2.5046327114105225, + "learning_rate": 6.487376509330406e-07, + "loss": 0.9916, + "mean_token_accuracy": 0.7061792612075806, + "num_tokens": 30127698.0, + "step": 1183 + }, + { + "epoch": 0.130024159894575, + "grad_norm": 2.437541961669922, + "learning_rate": 6.492864983534578e-07, + "loss": 1.1613, + "mean_token_accuracy": 0.6736527681350708, + "num_tokens": 30155309.0, + "step": 1184 + }, + { + "epoch": 0.13013397759718867, + "grad_norm": 2.3878581523895264, + "learning_rate": 6.498353457738749e-07, + "loss": 0.9998, + "mean_token_accuracy": 0.7014238834381104, + "num_tokens": 30177728.0, + "step": 1185 
+ }, + { + "epoch": 0.13024379529980232, + "grad_norm": 2.107811689376831, + "learning_rate": 6.50384193194292e-07, + "loss": 1.0481, + "mean_token_accuracy": 0.6945797204971313, + "num_tokens": 30206667.0, + "step": 1186 + }, + { + "epoch": 0.130353613002416, + "grad_norm": 2.497044563293457, + "learning_rate": 6.509330406147092e-07, + "loss": 1.0488, + "mean_token_accuracy": 0.6839106678962708, + "num_tokens": 30230793.0, + "step": 1187 + }, + { + "epoch": 0.13046343070502964, + "grad_norm": 3.6519415378570557, + "learning_rate": 6.514818880351261e-07, + "loss": 0.9342, + "mean_token_accuracy": 0.717467188835144, + "num_tokens": 30246328.0, + "step": 1188 + }, + { + "epoch": 0.1305732484076433, + "grad_norm": 2.69301438331604, + "learning_rate": 6.520307354555433e-07, + "loss": 1.1151, + "mean_token_accuracy": 0.6830857992172241, + "num_tokens": 30266979.0, + "step": 1189 + }, + { + "epoch": 0.13068306611025698, + "grad_norm": 2.4704599380493164, + "learning_rate": 6.525795828759604e-07, + "loss": 1.0034, + "mean_token_accuracy": 0.6955631971359253, + "num_tokens": 30289516.0, + "step": 1190 + }, + { + "epoch": 0.13079288381287063, + "grad_norm": 2.48382568359375, + "learning_rate": 6.531284302963775e-07, + "loss": 1.0535, + "mean_token_accuracy": 0.6942048072814941, + "num_tokens": 30312285.0, + "step": 1191 + }, + { + "epoch": 0.1309027015154843, + "grad_norm": 2.4783880710601807, + "learning_rate": 6.536772777167947e-07, + "loss": 0.9903, + "mean_token_accuracy": 0.7032691240310669, + "num_tokens": 30335102.0, + "step": 1192 + }, + { + "epoch": 0.13101251921809795, + "grad_norm": 2.3573665618896484, + "learning_rate": 6.542261251372118e-07, + "loss": 1.0583, + "mean_token_accuracy": 0.6923342943191528, + "num_tokens": 30360054.0, + "step": 1193 + }, + { + "epoch": 0.13112233692071162, + "grad_norm": 2.385044813156128, + "learning_rate": 6.547749725576289e-07, + "loss": 1.0407, + "mean_token_accuracy": 0.6967596411705017, + "num_tokens": 30384345.0, + "step": 
1194 + }, + { + "epoch": 0.13123215462332527, + "grad_norm": 2.329972982406616, + "learning_rate": 6.553238199780461e-07, + "loss": 1.1121, + "mean_token_accuracy": 0.6795937418937683, + "num_tokens": 30411598.0, + "step": 1195 + }, + { + "epoch": 0.13134197232593894, + "grad_norm": 2.544886827468872, + "learning_rate": 6.558726673984632e-07, + "loss": 1.0516, + "mean_token_accuracy": 0.6891261339187622, + "num_tokens": 30434465.0, + "step": 1196 + }, + { + "epoch": 0.1314517900285526, + "grad_norm": 2.3682801723480225, + "learning_rate": 6.564215148188803e-07, + "loss": 1.0489, + "mean_token_accuracy": 0.6885073184967041, + "num_tokens": 30458685.0, + "step": 1197 + }, + { + "epoch": 0.13156160773116626, + "grad_norm": 2.52835750579834, + "learning_rate": 6.569703622392975e-07, + "loss": 1.0195, + "mean_token_accuracy": 0.6917128562927246, + "num_tokens": 30481030.0, + "step": 1198 + }, + { + "epoch": 0.13167142543377994, + "grad_norm": 2.2801620960235596, + "learning_rate": 6.575192096597145e-07, + "loss": 0.9543, + "mean_token_accuracy": 0.7143297791481018, + "num_tokens": 30504895.0, + "step": 1199 + }, + { + "epoch": 0.13178124313639358, + "grad_norm": 2.4216229915618896, + "learning_rate": 6.580680570801317e-07, + "loss": 1.0081, + "mean_token_accuracy": 0.6939644813537598, + "num_tokens": 30527948.0, + "step": 1200 + }, + { + "epoch": 0.13189106083900726, + "grad_norm": 2.1326019763946533, + "learning_rate": 6.586169045005489e-07, + "loss": 0.9824, + "mean_token_accuracy": 0.7020478248596191, + "num_tokens": 30554876.0, + "step": 1201 + }, + { + "epoch": 0.1320008785416209, + "grad_norm": 2.2559757232666016, + "learning_rate": 6.591657519209659e-07, + "loss": 1.013, + "mean_token_accuracy": 0.7048732042312622, + "num_tokens": 30579991.0, + "step": 1202 + }, + { + "epoch": 0.13211069624423458, + "grad_norm": 2.111412286758423, + "learning_rate": 6.597145993413831e-07, + "loss": 1.1106, + "mean_token_accuracy": 0.6695999503135681, + "num_tokens": 30610730.0, + 
"step": 1203 + }, + { + "epoch": 0.13222051394684822, + "grad_norm": 2.25353741645813, + "learning_rate": 6.602634467618003e-07, + "loss": 1.0302, + "mean_token_accuracy": 0.6932410001754761, + "num_tokens": 30639002.0, + "step": 1204 + }, + { + "epoch": 0.1323303316494619, + "grad_norm": 1.922154426574707, + "learning_rate": 6.608122941822173e-07, + "loss": 1.0718, + "mean_token_accuracy": 0.6772156953811646, + "num_tokens": 30676633.0, + "step": 1205 + }, + { + "epoch": 0.13244014935207554, + "grad_norm": 2.3078932762145996, + "learning_rate": 6.613611416026345e-07, + "loss": 1.0141, + "mean_token_accuracy": 0.6896341443061829, + "num_tokens": 30702522.0, + "step": 1206 + }, + { + "epoch": 0.13254996705468922, + "grad_norm": 2.222733736038208, + "learning_rate": 6.619099890230515e-07, + "loss": 1.0543, + "mean_token_accuracy": 0.6828552484512329, + "num_tokens": 30730806.0, + "step": 1207 + }, + { + "epoch": 0.1326597847573029, + "grad_norm": 2.1602039337158203, + "learning_rate": 6.624588364434686e-07, + "loss": 1.108, + "mean_token_accuracy": 0.6785627603530884, + "num_tokens": 30759289.0, + "step": 1208 + }, + { + "epoch": 0.13276960245991654, + "grad_norm": 2.5682075023651123, + "learning_rate": 6.630076838638858e-07, + "loss": 1.0464, + "mean_token_accuracy": 0.6874816417694092, + "num_tokens": 30783385.0, + "step": 1209 + }, + { + "epoch": 0.1328794201625302, + "grad_norm": 2.1474950313568115, + "learning_rate": 6.635565312843029e-07, + "loss": 1.0554, + "mean_token_accuracy": 0.6979379057884216, + "num_tokens": 30811318.0, + "step": 1210 + }, + { + "epoch": 0.13298923786514386, + "grad_norm": 2.027069330215454, + "learning_rate": 6.6410537870472e-07, + "loss": 1.0826, + "mean_token_accuracy": 0.6821612119674683, + "num_tokens": 30843719.0, + "step": 1211 + }, + { + "epoch": 0.13309905556775753, + "grad_norm": 2.077916383743286, + "learning_rate": 6.646542261251372e-07, + "loss": 1.0879, + "mean_token_accuracy": 0.6782204508781433, + "num_tokens": 
30875728.0, + "step": 1212 + }, + { + "epoch": 0.13320887327037118, + "grad_norm": 2.3195106983184814, + "learning_rate": 6.652030735455543e-07, + "loss": 1.0051, + "mean_token_accuracy": 0.6956944465637207, + "num_tokens": 30901913.0, + "step": 1213 + }, + { + "epoch": 0.13331869097298485, + "grad_norm": 2.4394278526306152, + "learning_rate": 6.657519209659714e-07, + "loss": 1.0429, + "mean_token_accuracy": 0.6990418434143066, + "num_tokens": 30925681.0, + "step": 1214 + }, + { + "epoch": 0.1334285086755985, + "grad_norm": 2.585064172744751, + "learning_rate": 6.663007683863886e-07, + "loss": 1.032, + "mean_token_accuracy": 0.6972388029098511, + "num_tokens": 30947377.0, + "step": 1215 + }, + { + "epoch": 0.13353832637821217, + "grad_norm": 2.562422513961792, + "learning_rate": 6.668496158068057e-07, + "loss": 1.0703, + "mean_token_accuracy": 0.683283805847168, + "num_tokens": 30970510.0, + "step": 1216 + }, + { + "epoch": 0.13364814408082584, + "grad_norm": 2.323390483856201, + "learning_rate": 6.673984632272228e-07, + "loss": 0.9761, + "mean_token_accuracy": 0.6996710300445557, + "num_tokens": 30994384.0, + "step": 1217 + }, + { + "epoch": 0.1337579617834395, + "grad_norm": 2.429476737976074, + "learning_rate": 6.679473106476399e-07, + "loss": 0.9995, + "mean_token_accuracy": 0.7008026838302612, + "num_tokens": 31017432.0, + "step": 1218 + }, + { + "epoch": 0.13386777948605316, + "grad_norm": 2.175537347793579, + "learning_rate": 6.684961580680571e-07, + "loss": 1.0663, + "mean_token_accuracy": 0.6768872141838074, + "num_tokens": 31045962.0, + "step": 1219 + }, + { + "epoch": 0.1339775971886668, + "grad_norm": 2.1940970420837402, + "learning_rate": 6.690450054884742e-07, + "loss": 0.9633, + "mean_token_accuracy": 0.7112233638763428, + "num_tokens": 31074265.0, + "step": 1220 + }, + { + "epoch": 0.13408741489128048, + "grad_norm": 2.2546818256378174, + "learning_rate": 6.695938529088913e-07, + "loss": 1.0313, + "mean_token_accuracy": 0.689365565776825, + 
"num_tokens": 31100521.0, + "step": 1221 + }, + { + "epoch": 0.13419723259389413, + "grad_norm": 2.484203338623047, + "learning_rate": 6.701427003293085e-07, + "loss": 1.0364, + "mean_token_accuracy": 0.6934523582458496, + "num_tokens": 31124264.0, + "step": 1222 + }, + { + "epoch": 0.1343070502965078, + "grad_norm": 2.287973165512085, + "learning_rate": 6.706915477497256e-07, + "loss": 1.0288, + "mean_token_accuracy": 0.6925603151321411, + "num_tokens": 31150964.0, + "step": 1223 + }, + { + "epoch": 0.13441686799912145, + "grad_norm": 2.5101747512817383, + "learning_rate": 6.712403951701427e-07, + "loss": 1.135, + "mean_token_accuracy": 0.6660556793212891, + "num_tokens": 31175194.0, + "step": 1224 + }, + { + "epoch": 0.13452668570173512, + "grad_norm": 2.196347236633301, + "learning_rate": 6.717892425905599e-07, + "loss": 1.1365, + "mean_token_accuracy": 0.6620454788208008, + "num_tokens": 31204336.0, + "step": 1225 + }, + { + "epoch": 0.13463650340434877, + "grad_norm": 2.7177329063415527, + "learning_rate": 6.723380900109769e-07, + "loss": 0.9238, + "mean_token_accuracy": 0.7264748215675354, + "num_tokens": 31223180.0, + "step": 1226 + }, + { + "epoch": 0.13474632110696244, + "grad_norm": 2.7049152851104736, + "learning_rate": 6.72886937431394e-07, + "loss": 1.0527, + "mean_token_accuracy": 0.6903998851776123, + "num_tokens": 31245968.0, + "step": 1227 + }, + { + "epoch": 0.13485613880957611, + "grad_norm": 2.070117235183716, + "learning_rate": 6.734357848518111e-07, + "loss": 1.0552, + "mean_token_accuracy": 0.6883732676506042, + "num_tokens": 31275969.0, + "step": 1228 + }, + { + "epoch": 0.13496595651218976, + "grad_norm": 2.2176082134246826, + "learning_rate": 6.739846322722282e-07, + "loss": 1.037, + "mean_token_accuracy": 0.685396671295166, + "num_tokens": 31305819.0, + "step": 1229 + }, + { + "epoch": 0.13507577421480343, + "grad_norm": 2.307389259338379, + "learning_rate": 6.745334796926454e-07, + "loss": 1.0692, + "mean_token_accuracy": 
0.6830418109893799, + "num_tokens": 31331710.0, + "step": 1230 + }, + { + "epoch": 0.13518559191741708, + "grad_norm": 2.047257900238037, + "learning_rate": 6.750823271130625e-07, + "loss": 1.1207, + "mean_token_accuracy": 0.6659562587738037, + "num_tokens": 31363524.0, + "step": 1231 + }, + { + "epoch": 0.13529540962003075, + "grad_norm": 2.022742509841919, + "learning_rate": 6.756311745334796e-07, + "loss": 1.0457, + "mean_token_accuracy": 0.6873575448989868, + "num_tokens": 31394358.0, + "step": 1232 + }, + { + "epoch": 0.1354052273226444, + "grad_norm": 2.458885669708252, + "learning_rate": 6.761800219538968e-07, + "loss": 1.1154, + "mean_token_accuracy": 0.6789083480834961, + "num_tokens": 31418635.0, + "step": 1233 + }, + { + "epoch": 0.13551504502525807, + "grad_norm": 2.1934094429016113, + "learning_rate": 6.767288693743139e-07, + "loss": 1.0725, + "mean_token_accuracy": 0.6879323720932007, + "num_tokens": 31446236.0, + "step": 1234 + }, + { + "epoch": 0.13562486272787172, + "grad_norm": 2.3296003341674805, + "learning_rate": 6.77277716794731e-07, + "loss": 0.9687, + "mean_token_accuracy": 0.712737500667572, + "num_tokens": 31471646.0, + "step": 1235 + }, + { + "epoch": 0.1357346804304854, + "grad_norm": 2.338712453842163, + "learning_rate": 6.778265642151482e-07, + "loss": 1.0649, + "mean_token_accuracy": 0.6891981363296509, + "num_tokens": 31496727.0, + "step": 1236 + }, + { + "epoch": 0.13584449813309907, + "grad_norm": 2.1810827255249023, + "learning_rate": 6.783754116355653e-07, + "loss": 1.076, + "mean_token_accuracy": 0.6824997663497925, + "num_tokens": 31525745.0, + "step": 1237 + }, + { + "epoch": 0.1359543158357127, + "grad_norm": 2.31079363822937, + "learning_rate": 6.789242590559824e-07, + "loss": 1.1225, + "mean_token_accuracy": 0.6709499955177307, + "num_tokens": 31552608.0, + "step": 1238 + }, + { + "epoch": 0.1360641335383264, + "grad_norm": 2.4396212100982666, + "learning_rate": 6.794731064763996e-07, + "loss": 1.0333, + 
"mean_token_accuracy": 0.688383936882019, + "num_tokens": 31576178.0, + "step": 1239 + }, + { + "epoch": 0.13617395124094003, + "grad_norm": 2.4315707683563232, + "learning_rate": 6.800219538968166e-07, + "loss": 0.9941, + "mean_token_accuracy": 0.6979539394378662, + "num_tokens": 31598390.0, + "step": 1240 + }, + { + "epoch": 0.1362837689435537, + "grad_norm": 2.1703052520751953, + "learning_rate": 6.805708013172338e-07, + "loss": 0.9795, + "mean_token_accuracy": 0.7026612758636475, + "num_tokens": 31625990.0, + "step": 1241 + }, + { + "epoch": 0.13639358664616735, + "grad_norm": 2.7345833778381348, + "learning_rate": 6.81119648737651e-07, + "loss": 1.0057, + "mean_token_accuracy": 0.6969124674797058, + "num_tokens": 31645003.0, + "step": 1242 + }, + { + "epoch": 0.13650340434878103, + "grad_norm": 2.303431749343872, + "learning_rate": 6.81668496158068e-07, + "loss": 1.0459, + "mean_token_accuracy": 0.6854749917984009, + "num_tokens": 31671900.0, + "step": 1243 + }, + { + "epoch": 0.13661322205139467, + "grad_norm": 2.2421038150787354, + "learning_rate": 6.822173435784852e-07, + "loss": 1.039, + "mean_token_accuracy": 0.6896024942398071, + "num_tokens": 31698429.0, + "step": 1244 + }, + { + "epoch": 0.13672303975400835, + "grad_norm": 2.430774688720703, + "learning_rate": 6.827661909989023e-07, + "loss": 1.0206, + "mean_token_accuracy": 0.6938161253929138, + "num_tokens": 31722009.0, + "step": 1245 + }, + { + "epoch": 0.13683285745662202, + "grad_norm": 2.3466169834136963, + "learning_rate": 6.833150384193193e-07, + "loss": 1.1294, + "mean_token_accuracy": 0.6798732876777649, + "num_tokens": 31751119.0, + "step": 1246 + }, + { + "epoch": 0.13694267515923567, + "grad_norm": 2.2844231128692627, + "learning_rate": 6.838638858397365e-07, + "loss": 1.0876, + "mean_token_accuracy": 0.6815018653869629, + "num_tokens": 31780032.0, + "step": 1247 + }, + { + "epoch": 0.13705249286184934, + "grad_norm": 2.821206569671631, + "learning_rate": 6.844127332601537e-07, + "loss": 
0.9489, + "mean_token_accuracy": 0.7105352878570557, + "num_tokens": 31798862.0, + "step": 1248 + }, + { + "epoch": 0.13716231056446299, + "grad_norm": 2.135842800140381, + "learning_rate": 6.849615806805707e-07, + "loss": 1.1514, + "mean_token_accuracy": 0.6697074770927429, + "num_tokens": 31831999.0, + "step": 1249 + }, + { + "epoch": 0.13727212826707666, + "grad_norm": 2.5555953979492188, + "learning_rate": 6.855104281009879e-07, + "loss": 1.029, + "mean_token_accuracy": 0.6939056515693665, + "num_tokens": 31855735.0, + "step": 1250 + }, + { + "epoch": 0.1373819459696903, + "grad_norm": 2.513615131378174, + "learning_rate": 6.86059275521405e-07, + "loss": 1.0853, + "mean_token_accuracy": 0.6702587604522705, + "num_tokens": 31878670.0, + "step": 1251 + }, + { + "epoch": 0.13749176367230398, + "grad_norm": 2.3936476707458496, + "learning_rate": 6.866081229418221e-07, + "loss": 1.0707, + "mean_token_accuracy": 0.6830839514732361, + "num_tokens": 31903640.0, + "step": 1252 + }, + { + "epoch": 0.13760158137491763, + "grad_norm": 2.3329226970672607, + "learning_rate": 6.871569703622393e-07, + "loss": 0.9425, + "mean_token_accuracy": 0.7114503383636475, + "num_tokens": 31927238.0, + "step": 1253 + }, + { + "epoch": 0.1377113990775313, + "grad_norm": 2.09138822555542, + "learning_rate": 6.877058177826564e-07, + "loss": 1.0883, + "mean_token_accuracy": 0.6825312972068787, + "num_tokens": 31958802.0, + "step": 1254 + }, + { + "epoch": 0.13782121678014497, + "grad_norm": 2.3067820072174072, + "learning_rate": 6.882546652030735e-07, + "loss": 0.9472, + "mean_token_accuracy": 0.717106282711029, + "num_tokens": 31984972.0, + "step": 1255 + }, + { + "epoch": 0.13793103448275862, + "grad_norm": 2.1918108463287354, + "learning_rate": 6.888035126234907e-07, + "loss": 1.091, + "mean_token_accuracy": 0.6833152770996094, + "num_tokens": 32011820.0, + "step": 1256 + }, + { + "epoch": 0.1380408521853723, + "grad_norm": 1.9174036979675293, + "learning_rate": 6.893523600439078e-07, + 
"loss": 1.0476, + "mean_token_accuracy": 0.6882820129394531, + "num_tokens": 32044356.0, + "step": 1257 + }, + { + "epoch": 0.13815066988798594, + "grad_norm": 2.3829548358917236, + "learning_rate": 6.899012074643249e-07, + "loss": 1.066, + "mean_token_accuracy": 0.6847816705703735, + "num_tokens": 32071286.0, + "step": 1258 + }, + { + "epoch": 0.1382604875905996, + "grad_norm": 2.459289073944092, + "learning_rate": 6.904500548847421e-07, + "loss": 1.0562, + "mean_token_accuracy": 0.6866821050643921, + "num_tokens": 32095844.0, + "step": 1259 + }, + { + "epoch": 0.13837030529321326, + "grad_norm": 2.4124066829681396, + "learning_rate": 6.909989023051592e-07, + "loss": 1.1159, + "mean_token_accuracy": 0.6657267808914185, + "num_tokens": 32120622.0, + "step": 1260 + }, + { + "epoch": 0.13848012299582693, + "grad_norm": 2.572037696838379, + "learning_rate": 6.915477497255763e-07, + "loss": 1.0682, + "mean_token_accuracy": 0.6790652275085449, + "num_tokens": 32143153.0, + "step": 1261 + }, + { + "epoch": 0.13858994069844058, + "grad_norm": 2.7213287353515625, + "learning_rate": 6.920965971459934e-07, + "loss": 1.0995, + "mean_token_accuracy": 0.676389217376709, + "num_tokens": 32164839.0, + "step": 1262 + }, + { + "epoch": 0.13869975840105425, + "grad_norm": 2.224069356918335, + "learning_rate": 6.926454445664105e-07, + "loss": 1.0301, + "mean_token_accuracy": 0.6894317865371704, + "num_tokens": 32192725.0, + "step": 1263 + }, + { + "epoch": 0.1388095761036679, + "grad_norm": 2.417750835418701, + "learning_rate": 6.931942919868276e-07, + "loss": 0.9887, + "mean_token_accuracy": 0.6984099745750427, + "num_tokens": 32216103.0, + "step": 1264 + }, + { + "epoch": 0.13891939380628157, + "grad_norm": 2.321413993835449, + "learning_rate": 6.937431394072447e-07, + "loss": 1.0851, + "mean_token_accuracy": 0.6886981129646301, + "num_tokens": 32244197.0, + "step": 1265 + }, + { + "epoch": 0.13902921150889525, + "grad_norm": 2.4029669761657715, + "learning_rate": 
6.942919868276618e-07, + "loss": 0.9997, + "mean_token_accuracy": 0.7034192681312561, + "num_tokens": 32268090.0, + "step": 1266 + }, + { + "epoch": 0.1391390292115089, + "grad_norm": 2.5263712406158447, + "learning_rate": 6.94840834248079e-07, + "loss": 1.0783, + "mean_token_accuracy": 0.6805779337882996, + "num_tokens": 32291226.0, + "step": 1267 + }, + { + "epoch": 0.13924884691412256, + "grad_norm": 2.1281144618988037, + "learning_rate": 6.953896816684961e-07, + "loss": 1.0652, + "mean_token_accuracy": 0.6875847578048706, + "num_tokens": 32321553.0, + "step": 1268 + }, + { + "epoch": 0.1393586646167362, + "grad_norm": 2.391331911087036, + "learning_rate": 6.959385290889132e-07, + "loss": 0.9275, + "mean_token_accuracy": 0.7188643217086792, + "num_tokens": 32345638.0, + "step": 1269 + }, + { + "epoch": 0.13946848231934988, + "grad_norm": 2.2564992904663086, + "learning_rate": 6.964873765093304e-07, + "loss": 1.1017, + "mean_token_accuracy": 0.6746236085891724, + "num_tokens": 32374977.0, + "step": 1270 + }, + { + "epoch": 0.13957830002196353, + "grad_norm": 2.3218941688537598, + "learning_rate": 6.970362239297475e-07, + "loss": 0.9366, + "mean_token_accuracy": 0.7243694067001343, + "num_tokens": 32397855.0, + "step": 1271 + }, + { + "epoch": 0.1396881177245772, + "grad_norm": 2.3230133056640625, + "learning_rate": 6.975850713501646e-07, + "loss": 1.094, + "mean_token_accuracy": 0.6821996569633484, + "num_tokens": 32425311.0, + "step": 1272 + }, + { + "epoch": 0.13979793542719085, + "grad_norm": 2.151118755340576, + "learning_rate": 6.981339187705817e-07, + "loss": 1.0423, + "mean_token_accuracy": 0.6966368556022644, + "num_tokens": 32452921.0, + "step": 1273 + }, + { + "epoch": 0.13990775312980452, + "grad_norm": 2.0326969623565674, + "learning_rate": 6.986827661909989e-07, + "loss": 1.1108, + "mean_token_accuracy": 0.6661593914031982, + "num_tokens": 32486603.0, + "step": 1274 + }, + { + "epoch": 0.1400175708324182, + "grad_norm": 2.603623151779175, + 
"learning_rate": 6.99231613611416e-07, + "loss": 1.0204, + "mean_token_accuracy": 0.6986958384513855, + "num_tokens": 32507825.0, + "step": 1275 + }, + { + "epoch": 0.14012738853503184, + "grad_norm": 2.4673051834106445, + "learning_rate": 6.997804610318331e-07, + "loss": 1.0606, + "mean_token_accuracy": 0.698691189289093, + "num_tokens": 32530516.0, + "step": 1276 + }, + { + "epoch": 0.14023720623764552, + "grad_norm": 2.260955333709717, + "learning_rate": 7.003293084522503e-07, + "loss": 1.1388, + "mean_token_accuracy": 0.6659489870071411, + "num_tokens": 32558518.0, + "step": 1277 + }, + { + "epoch": 0.14034702394025916, + "grad_norm": 2.7081339359283447, + "learning_rate": 7.008781558726674e-07, + "loss": 1.0516, + "mean_token_accuracy": 0.6831269264221191, + "num_tokens": 32577939.0, + "step": 1278 + }, + { + "epoch": 0.14045684164287284, + "grad_norm": 2.274188995361328, + "learning_rate": 7.014270032930845e-07, + "loss": 1.0394, + "mean_token_accuracy": 0.6970707774162292, + "num_tokens": 32604109.0, + "step": 1279 + }, + { + "epoch": 0.14056665934548648, + "grad_norm": 2.4091737270355225, + "learning_rate": 7.019758507135017e-07, + "loss": 0.9859, + "mean_token_accuracy": 0.6988632678985596, + "num_tokens": 32626145.0, + "step": 1280 + }, + { + "epoch": 0.14067647704810016, + "grad_norm": 2.2795844078063965, + "learning_rate": 7.025246981339188e-07, + "loss": 0.9938, + "mean_token_accuracy": 0.7063440084457397, + "num_tokens": 32651061.0, + "step": 1281 + }, + { + "epoch": 0.1407862947507138, + "grad_norm": 2.2949180603027344, + "learning_rate": 7.030735455543358e-07, + "loss": 0.996, + "mean_token_accuracy": 0.7038044333457947, + "num_tokens": 32675054.0, + "step": 1282 + }, + { + "epoch": 0.14089611245332748, + "grad_norm": 2.4185550212860107, + "learning_rate": 7.03622392974753e-07, + "loss": 1.0028, + "mean_token_accuracy": 0.7043367028236389, + "num_tokens": 32698768.0, + "step": 1283 + }, + { + "epoch": 0.14100593015594115, + "grad_norm": 
2.389267921447754, + "learning_rate": 7.0417124039517e-07, + "loss": 1.0191, + "mean_token_accuracy": 0.6993314623832703, + "num_tokens": 32722094.0, + "step": 1284 + }, + { + "epoch": 0.1411157478585548, + "grad_norm": 2.5633580684661865, + "learning_rate": 7.047200878155872e-07, + "loss": 1.0971, + "mean_token_accuracy": 0.6854774951934814, + "num_tokens": 32743276.0, + "step": 1285 + }, + { + "epoch": 0.14122556556116847, + "grad_norm": 2.5214691162109375, + "learning_rate": 7.052689352360044e-07, + "loss": 0.9856, + "mean_token_accuracy": 0.7001399993896484, + "num_tokens": 32765599.0, + "step": 1286 + }, + { + "epoch": 0.14133538326378212, + "grad_norm": 2.2863609790802, + "learning_rate": 7.058177826564214e-07, + "loss": 1.0769, + "mean_token_accuracy": 0.6913025379180908, + "num_tokens": 32791595.0, + "step": 1287 + }, + { + "epoch": 0.1414452009663958, + "grad_norm": 2.402698040008545, + "learning_rate": 7.063666300768386e-07, + "loss": 1.1443, + "mean_token_accuracy": 0.6623103618621826, + "num_tokens": 32820464.0, + "step": 1288 + }, + { + "epoch": 0.14155501866900944, + "grad_norm": 2.1533613204956055, + "learning_rate": 7.069154774972558e-07, + "loss": 1.0895, + "mean_token_accuracy": 0.677680492401123, + "num_tokens": 32850487.0, + "step": 1289 + }, + { + "epoch": 0.1416648363716231, + "grad_norm": 2.3076870441436768, + "learning_rate": 7.074643249176728e-07, + "loss": 0.9979, + "mean_token_accuracy": 0.6993193030357361, + "num_tokens": 32875071.0, + "step": 1290 + }, + { + "epoch": 0.14177465407423676, + "grad_norm": 2.467302083969116, + "learning_rate": 7.0801317233809e-07, + "loss": 0.9569, + "mean_token_accuracy": 0.7173880934715271, + "num_tokens": 32896193.0, + "step": 1291 + }, + { + "epoch": 0.14188447177685043, + "grad_norm": 2.462648868560791, + "learning_rate": 7.085620197585072e-07, + "loss": 1.1034, + "mean_token_accuracy": 0.676721453666687, + "num_tokens": 32922080.0, + "step": 1292 + }, + { + "epoch": 0.1419942894794641, + "grad_norm": 
1.927240014076233, + "learning_rate": 7.091108671789242e-07, + "loss": 1.1061, + "mean_token_accuracy": 0.6739526987075806, + "num_tokens": 32956205.0, + "step": 1293 + }, + { + "epoch": 0.14210410718207775, + "grad_norm": 2.257502317428589, + "learning_rate": 7.096597145993414e-07, + "loss": 1.0141, + "mean_token_accuracy": 0.6916544437408447, + "num_tokens": 32982433.0, + "step": 1294 + }, + { + "epoch": 0.14221392488469142, + "grad_norm": 2.5094122886657715, + "learning_rate": 7.102085620197584e-07, + "loss": 1.0827, + "mean_token_accuracy": 0.6681360602378845, + "num_tokens": 33004907.0, + "step": 1295 + }, + { + "epoch": 0.14232374258730507, + "grad_norm": 2.196605682373047, + "learning_rate": 7.107574094401756e-07, + "loss": 1.0075, + "mean_token_accuracy": 0.7069164514541626, + "num_tokens": 33030067.0, + "step": 1296 + }, + { + "epoch": 0.14243356028991874, + "grad_norm": 2.360175132751465, + "learning_rate": 7.113062568605928e-07, + "loss": 1.0184, + "mean_token_accuracy": 0.6901154518127441, + "num_tokens": 33053488.0, + "step": 1297 + }, + { + "epoch": 0.1425433779925324, + "grad_norm": 2.129232406616211, + "learning_rate": 7.118551042810098e-07, + "loss": 1.0569, + "mean_token_accuracy": 0.6844586133956909, + "num_tokens": 33080331.0, + "step": 1298 + }, + { + "epoch": 0.14265319569514606, + "grad_norm": 2.3392372131347656, + "learning_rate": 7.12403951701427e-07, + "loss": 1.1385, + "mean_token_accuracy": 0.6625628471374512, + "num_tokens": 33105806.0, + "step": 1299 + }, + { + "epoch": 0.1427630133977597, + "grad_norm": 2.217822313308716, + "learning_rate": 7.129527991218442e-07, + "loss": 1.0422, + "mean_token_accuracy": 0.6851252913475037, + "num_tokens": 33131195.0, + "step": 1300 + }, + { + "epoch": 0.14287283110037338, + "grad_norm": 2.2979989051818848, + "learning_rate": 7.135016465422611e-07, + "loss": 1.0388, + "mean_token_accuracy": 0.7020103931427002, + "num_tokens": 33156462.0, + "step": 1301 + }, + { + "epoch": 0.14298264880298703, + 
"grad_norm": 2.3476459980010986, + "learning_rate": 7.140504939626783e-07, + "loss": 0.9113, + "mean_token_accuracy": 0.7231253385543823, + "num_tokens": 33180199.0, + "step": 1302 + }, + { + "epoch": 0.1430924665056007, + "grad_norm": 2.1445815563201904, + "learning_rate": 7.145993413830955e-07, + "loss": 1.0304, + "mean_token_accuracy": 0.6979292035102844, + "num_tokens": 33208977.0, + "step": 1303 + }, + { + "epoch": 0.14320228420821438, + "grad_norm": 2.119905471801758, + "learning_rate": 7.151481888035125e-07, + "loss": 0.998, + "mean_token_accuracy": 0.7008439302444458, + "num_tokens": 33237571.0, + "step": 1304 + }, + { + "epoch": 0.14331210191082802, + "grad_norm": 2.277627468109131, + "learning_rate": 7.156970362239297e-07, + "loss": 0.9861, + "mean_token_accuracy": 0.7054558992385864, + "num_tokens": 33262173.0, + "step": 1305 + }, + { + "epoch": 0.1434219196134417, + "grad_norm": 2.007207155227661, + "learning_rate": 7.162458836443468e-07, + "loss": 1.0767, + "mean_token_accuracy": 0.6853607892990112, + "num_tokens": 33295651.0, + "step": 1306 + }, + { + "epoch": 0.14353173731605534, + "grad_norm": 2.3024442195892334, + "learning_rate": 7.167947310647639e-07, + "loss": 0.9749, + "mean_token_accuracy": 0.7283877730369568, + "num_tokens": 33319620.0, + "step": 1307 + }, + { + "epoch": 0.14364155501866901, + "grad_norm": 2.04559063911438, + "learning_rate": 7.173435784851811e-07, + "loss": 1.0971, + "mean_token_accuracy": 0.6795555949211121, + "num_tokens": 33350854.0, + "step": 1308 + }, + { + "epoch": 0.14375137272128266, + "grad_norm": 2.396899700164795, + "learning_rate": 7.178924259055982e-07, + "loss": 1.0417, + "mean_token_accuracy": 0.6890082359313965, + "num_tokens": 33374487.0, + "step": 1309 + }, + { + "epoch": 0.14386119042389633, + "grad_norm": 2.247833728790283, + "learning_rate": 7.184412733260153e-07, + "loss": 0.9123, + "mean_token_accuracy": 0.7244949340820312, + "num_tokens": 33402707.0, + "step": 1310 + }, + { + "epoch": 
0.14397100812650998, + "grad_norm": 2.2900822162628174, + "learning_rate": 7.189901207464325e-07, + "loss": 1.0309, + "mean_token_accuracy": 0.6901326179504395, + "num_tokens": 33429105.0, + "step": 1311 + }, + { + "epoch": 0.14408082582912365, + "grad_norm": 2.2761383056640625, + "learning_rate": 7.195389681668496e-07, + "loss": 1.0695, + "mean_token_accuracy": 0.6798995733261108, + "num_tokens": 33453904.0, + "step": 1312 + }, + { + "epoch": 0.14419064353173733, + "grad_norm": 2.545114755630493, + "learning_rate": 7.200878155872667e-07, + "loss": 0.8908, + "mean_token_accuracy": 0.7259565591812134, + "num_tokens": 33472931.0, + "step": 1313 + }, + { + "epoch": 0.14430046123435097, + "grad_norm": 2.4084560871124268, + "learning_rate": 7.206366630076838e-07, + "loss": 1.0539, + "mean_token_accuracy": 0.6855303645133972, + "num_tokens": 33496535.0, + "step": 1314 + }, + { + "epoch": 0.14441027893696465, + "grad_norm": 2.4951210021972656, + "learning_rate": 7.21185510428101e-07, + "loss": 1.0262, + "mean_token_accuracy": 0.6987862586975098, + "num_tokens": 33520838.0, + "step": 1315 + }, + { + "epoch": 0.1445200966395783, + "grad_norm": 2.0199971199035645, + "learning_rate": 7.217343578485181e-07, + "loss": 0.9931, + "mean_token_accuracy": 0.7031884789466858, + "num_tokens": 33551702.0, + "step": 1316 + }, + { + "epoch": 0.14462991434219197, + "grad_norm": 2.2040297985076904, + "learning_rate": 7.222832052689352e-07, + "loss": 1.0027, + "mean_token_accuracy": 0.7055044174194336, + "num_tokens": 33578498.0, + "step": 1317 + }, + { + "epoch": 0.1447397320448056, + "grad_norm": 2.407269239425659, + "learning_rate": 7.228320526893524e-07, + "loss": 1.0493, + "mean_token_accuracy": 0.6965422630310059, + "num_tokens": 33603467.0, + "step": 1318 + }, + { + "epoch": 0.1448495497474193, + "grad_norm": 2.13639760017395, + "learning_rate": 7.233809001097695e-07, + "loss": 1.0729, + "mean_token_accuracy": 0.6806930303573608, + "num_tokens": 33633392.0, + "step": 1319 + }, + { + 
"epoch": 0.14495936745003293, + "grad_norm": 2.332578659057617, + "learning_rate": 7.239297475301865e-07, + "loss": 1.1194, + "mean_token_accuracy": 0.670099139213562, + "num_tokens": 33658500.0, + "step": 1320 + }, + { + "epoch": 0.1450691851526466, + "grad_norm": 1.867723822593689, + "learning_rate": 7.244785949506037e-07, + "loss": 1.1334, + "mean_token_accuracy": 0.6671434640884399, + "num_tokens": 33696213.0, + "step": 1321 + }, + { + "epoch": 0.14517900285526028, + "grad_norm": 2.537379741668701, + "learning_rate": 7.250274423710208e-07, + "loss": 1.1081, + "mean_token_accuracy": 0.676557183265686, + "num_tokens": 33719451.0, + "step": 1322 + }, + { + "epoch": 0.14528882055787393, + "grad_norm": 2.256850004196167, + "learning_rate": 7.255762897914379e-07, + "loss": 1.0668, + "mean_token_accuracy": 0.6851454973220825, + "num_tokens": 33746243.0, + "step": 1323 + }, + { + "epoch": 0.1453986382604876, + "grad_norm": 2.011866569519043, + "learning_rate": 7.261251372118551e-07, + "loss": 1.1172, + "mean_token_accuracy": 0.6793073415756226, + "num_tokens": 33779920.0, + "step": 1324 + }, + { + "epoch": 0.14550845596310125, + "grad_norm": 2.2368407249450684, + "learning_rate": 7.266739846322721e-07, + "loss": 1.1099, + "mean_token_accuracy": 0.6671556830406189, + "num_tokens": 33807662.0, + "step": 1325 + }, + { + "epoch": 0.14561827366571492, + "grad_norm": 2.3867719173431396, + "learning_rate": 7.272228320526893e-07, + "loss": 0.8968, + "mean_token_accuracy": 0.7267519235610962, + "num_tokens": 33831290.0, + "step": 1326 + }, + { + "epoch": 0.14572809136832857, + "grad_norm": 2.529881000518799, + "learning_rate": 7.277716794731065e-07, + "loss": 0.9725, + "mean_token_accuracy": 0.7039568424224854, + "num_tokens": 33853107.0, + "step": 1327 + }, + { + "epoch": 0.14583790907094224, + "grad_norm": 2.183500289916992, + "learning_rate": 7.283205268935235e-07, + "loss": 1.0109, + "mean_token_accuracy": 0.6971198916435242, + "num_tokens": 33880007.0, + "step": 1328 + }, 
+ { + "epoch": 0.14594772677355589, + "grad_norm": 2.5114495754241943, + "learning_rate": 7.288693743139407e-07, + "loss": 0.998, + "mean_token_accuracy": 0.6953231692314148, + "num_tokens": 33902072.0, + "step": 1329 + }, + { + "epoch": 0.14605754447616956, + "grad_norm": 2.383419990539551, + "learning_rate": 7.294182217343579e-07, + "loss": 0.9962, + "mean_token_accuracy": 0.6973929405212402, + "num_tokens": 33925686.0, + "step": 1330 + }, + { + "epoch": 0.14616736217878323, + "grad_norm": 2.231046199798584, + "learning_rate": 7.299670691547749e-07, + "loss": 1.0826, + "mean_token_accuracy": 0.684705376625061, + "num_tokens": 33951478.0, + "step": 1331 + }, + { + "epoch": 0.14627717988139688, + "grad_norm": 2.5260074138641357, + "learning_rate": 7.305159165751921e-07, + "loss": 1.0167, + "mean_token_accuracy": 0.6969327330589294, + "num_tokens": 33974158.0, + "step": 1332 + }, + { + "epoch": 0.14638699758401055, + "grad_norm": 2.3399503231048584, + "learning_rate": 7.310647639956093e-07, + "loss": 1.0965, + "mean_token_accuracy": 0.6747813820838928, + "num_tokens": 34001441.0, + "step": 1333 + }, + { + "epoch": 0.1464968152866242, + "grad_norm": 2.398972988128662, + "learning_rate": 7.316136114160263e-07, + "loss": 1.0624, + "mean_token_accuracy": 0.6856744289398193, + "num_tokens": 34025972.0, + "step": 1334 + }, + { + "epoch": 0.14660663298923787, + "grad_norm": 2.1633858680725098, + "learning_rate": 7.321624588364435e-07, + "loss": 0.9917, + "mean_token_accuracy": 0.7038466334342957, + "num_tokens": 34053782.0, + "step": 1335 + }, + { + "epoch": 0.14671645069185152, + "grad_norm": 2.209078788757324, + "learning_rate": 7.327113062568605e-07, + "loss": 1.1345, + "mean_token_accuracy": 0.6672442555427551, + "num_tokens": 34082095.0, + "step": 1336 + }, + { + "epoch": 0.1468262683944652, + "grad_norm": 2.615291118621826, + "learning_rate": 7.332601536772777e-07, + "loss": 1.1029, + "mean_token_accuracy": 0.6802607774734497, + "num_tokens": 34104468.0, + "step": 
1337 + }, + { + "epoch": 0.14693608609707884, + "grad_norm": 2.288522243499756, + "learning_rate": 7.338090010976949e-07, + "loss": 1.0385, + "mean_token_accuracy": 0.6904780864715576, + "num_tokens": 34130477.0, + "step": 1338 + }, + { + "epoch": 0.1470459037996925, + "grad_norm": 2.357867956161499, + "learning_rate": 7.343578485181118e-07, + "loss": 1.0881, + "mean_token_accuracy": 0.6776160001754761, + "num_tokens": 34154966.0, + "step": 1339 + }, + { + "epoch": 0.14715572150230616, + "grad_norm": 2.594458818435669, + "learning_rate": 7.34906695938529e-07, + "loss": 1.0771, + "mean_token_accuracy": 0.6748323440551758, + "num_tokens": 34175482.0, + "step": 1340 + }, + { + "epoch": 0.14726553920491983, + "grad_norm": 2.296968698501587, + "learning_rate": 7.354555433589462e-07, + "loss": 0.9312, + "mean_token_accuracy": 0.7133293747901917, + "num_tokens": 34197416.0, + "step": 1341 + }, + { + "epoch": 0.1473753569075335, + "grad_norm": 2.2323505878448486, + "learning_rate": 7.360043907793632e-07, + "loss": 1.0524, + "mean_token_accuracy": 0.6839545369148254, + "num_tokens": 34223430.0, + "step": 1342 + }, + { + "epoch": 0.14748517461014715, + "grad_norm": 2.223198413848877, + "learning_rate": 7.365532381997804e-07, + "loss": 1.0995, + "mean_token_accuracy": 0.6689292788505554, + "num_tokens": 34251652.0, + "step": 1343 + }, + { + "epoch": 0.14759499231276083, + "grad_norm": 2.4287893772125244, + "learning_rate": 7.371020856201976e-07, + "loss": 0.9303, + "mean_token_accuracy": 0.7111620306968689, + "num_tokens": 34273242.0, + "step": 1344 + }, + { + "epoch": 0.14770481001537447, + "grad_norm": 2.249608278274536, + "learning_rate": 7.376509330406146e-07, + "loss": 1.0105, + "mean_token_accuracy": 0.7024099826812744, + "num_tokens": 34300722.0, + "step": 1345 + }, + { + "epoch": 0.14781462771798815, + "grad_norm": 2.4161410331726074, + "learning_rate": 7.381997804610318e-07, + "loss": 1.0632, + "mean_token_accuracy": 0.694229006767273, + "num_tokens": 34324309.0, + 
"step": 1346 + }, + { + "epoch": 0.1479244454206018, + "grad_norm": 2.3630104064941406, + "learning_rate": 7.387486278814489e-07, + "loss": 1.0455, + "mean_token_accuracy": 0.689639687538147, + "num_tokens": 34348619.0, + "step": 1347 + }, + { + "epoch": 0.14803426312321546, + "grad_norm": 2.3949594497680664, + "learning_rate": 7.39297475301866e-07, + "loss": 1.0894, + "mean_token_accuracy": 0.6762447357177734, + "num_tokens": 34373768.0, + "step": 1348 + }, + { + "epoch": 0.1481440808258291, + "grad_norm": 2.3255960941314697, + "learning_rate": 7.398463227222832e-07, + "loss": 1.0404, + "mean_token_accuracy": 0.6885454654693604, + "num_tokens": 34399377.0, + "step": 1349 + }, + { + "epoch": 0.14825389852844278, + "grad_norm": 2.5491697788238525, + "learning_rate": 7.403951701427003e-07, + "loss": 1.0148, + "mean_token_accuracy": 0.6929913759231567, + "num_tokens": 34421590.0, + "step": 1350 + }, + { + "epoch": 0.14836371623105646, + "grad_norm": 2.200634717941284, + "learning_rate": 7.409440175631174e-07, + "loss": 1.0941, + "mean_token_accuracy": 0.6794909238815308, + "num_tokens": 34449882.0, + "step": 1351 + }, + { + "epoch": 0.1484735339336701, + "grad_norm": 2.5237748622894287, + "learning_rate": 7.414928649835346e-07, + "loss": 0.9814, + "mean_token_accuracy": 0.7021857500076294, + "num_tokens": 34470644.0, + "step": 1352 + }, + { + "epoch": 0.14858335163628378, + "grad_norm": 2.187809944152832, + "learning_rate": 7.420417124039517e-07, + "loss": 1.0071, + "mean_token_accuracy": 0.6918795704841614, + "num_tokens": 34498319.0, + "step": 1353 + }, + { + "epoch": 0.14869316933889742, + "grad_norm": 2.361701488494873, + "learning_rate": 7.425905598243688e-07, + "loss": 0.969, + "mean_token_accuracy": 0.7052311897277832, + "num_tokens": 34520396.0, + "step": 1354 + }, + { + "epoch": 0.1488029870415111, + "grad_norm": 2.4715845584869385, + "learning_rate": 7.43139407244786e-07, + "loss": 1.0736, + "mean_token_accuracy": 0.6897517442703247, + "num_tokens": 
34543788.0, + "step": 1355 + }, + { + "epoch": 0.14891280474412474, + "grad_norm": 2.3259873390197754, + "learning_rate": 7.436882546652031e-07, + "loss": 0.9785, + "mean_token_accuracy": 0.7034205198287964, + "num_tokens": 34569166.0, + "step": 1356 + }, + { + "epoch": 0.14902262244673842, + "grad_norm": 2.1584582328796387, + "learning_rate": 7.442371020856202e-07, + "loss": 1.0212, + "mean_token_accuracy": 0.694300651550293, + "num_tokens": 34598254.0, + "step": 1357 + }, + { + "epoch": 0.14913244014935206, + "grad_norm": 2.4257397651672363, + "learning_rate": 7.447859495060372e-07, + "loss": 1.06, + "mean_token_accuracy": 0.6892149448394775, + "num_tokens": 34621068.0, + "step": 1358 + }, + { + "epoch": 0.14924225785196574, + "grad_norm": 2.154474973678589, + "learning_rate": 7.453347969264544e-07, + "loss": 0.9971, + "mean_token_accuracy": 0.6939302682876587, + "num_tokens": 34647601.0, + "step": 1359 + }, + { + "epoch": 0.1493520755545794, + "grad_norm": 2.3651390075683594, + "learning_rate": 7.458836443468715e-07, + "loss": 1.0352, + "mean_token_accuracy": 0.6933432817459106, + "num_tokens": 34672289.0, + "step": 1360 + }, + { + "epoch": 0.14946189325719306, + "grad_norm": 2.216979742050171, + "learning_rate": 7.464324917672886e-07, + "loss": 1.0413, + "mean_token_accuracy": 0.6851420402526855, + "num_tokens": 34701496.0, + "step": 1361 + }, + { + "epoch": 0.14957171095980673, + "grad_norm": 2.371177911758423, + "learning_rate": 7.469813391877058e-07, + "loss": 1.0714, + "mean_token_accuracy": 0.6788146495819092, + "num_tokens": 34727649.0, + "step": 1362 + }, + { + "epoch": 0.14968152866242038, + "grad_norm": 2.0880298614501953, + "learning_rate": 7.475301866081229e-07, + "loss": 1.093, + "mean_token_accuracy": 0.6753327250480652, + "num_tokens": 34757423.0, + "step": 1363 + }, + { + "epoch": 0.14979134636503405, + "grad_norm": 2.1579749584198, + "learning_rate": 7.4807903402854e-07, + "loss": 0.9825, + "mean_token_accuracy": 0.7048391103744507, + 
"num_tokens": 34786794.0, + "step": 1364 + }, + { + "epoch": 0.1499011640676477, + "grad_norm": 2.081482410430908, + "learning_rate": 7.486278814489572e-07, + "loss": 1.0513, + "mean_token_accuracy": 0.684467613697052, + "num_tokens": 34817333.0, + "step": 1365 + }, + { + "epoch": 0.15001098177026137, + "grad_norm": 2.2055742740631104, + "learning_rate": 7.491767288693743e-07, + "loss": 1.1185, + "mean_token_accuracy": 0.6784387230873108, + "num_tokens": 34846485.0, + "step": 1366 + }, + { + "epoch": 0.15012079947287502, + "grad_norm": 2.4478964805603027, + "learning_rate": 7.497255762897914e-07, + "loss": 0.9766, + "mean_token_accuracy": 0.7045155763626099, + "num_tokens": 34869509.0, + "step": 1367 + }, + { + "epoch": 0.1502306171754887, + "grad_norm": 2.2401885986328125, + "learning_rate": 7.502744237102086e-07, + "loss": 1.0182, + "mean_token_accuracy": 0.6985120177268982, + "num_tokens": 34897314.0, + "step": 1368 + }, + { + "epoch": 0.15034043487810236, + "grad_norm": 2.4769203662872314, + "learning_rate": 7.508232711306256e-07, + "loss": 1.0507, + "mean_token_accuracy": 0.6849870085716248, + "num_tokens": 34920730.0, + "step": 1369 + }, + { + "epoch": 0.150450252580716, + "grad_norm": 2.3308093547821045, + "learning_rate": 7.513721185510428e-07, + "loss": 1.0718, + "mean_token_accuracy": 0.6793447136878967, + "num_tokens": 34948227.0, + "step": 1370 + }, + { + "epoch": 0.15056007028332968, + "grad_norm": 2.349902629852295, + "learning_rate": 7.5192096597146e-07, + "loss": 1.0073, + "mean_token_accuracy": 0.702282190322876, + "num_tokens": 34972844.0, + "step": 1371 + }, + { + "epoch": 0.15066988798594333, + "grad_norm": 2.39182710647583, + "learning_rate": 7.52469813391877e-07, + "loss": 1.0394, + "mean_token_accuracy": 0.6923153400421143, + "num_tokens": 34996844.0, + "step": 1372 + }, + { + "epoch": 0.150779705688557, + "grad_norm": 2.238847255706787, + "learning_rate": 7.530186608122942e-07, + "loss": 1.1438, + "mean_token_accuracy": 0.6677078008651733, + 
"num_tokens": 35025814.0, + "step": 1373 + }, + { + "epoch": 0.15088952339117065, + "grad_norm": 2.6918296813964844, + "learning_rate": 7.535675082327113e-07, + "loss": 1.0381, + "mean_token_accuracy": 0.6890278458595276, + "num_tokens": 35046924.0, + "step": 1374 + }, + { + "epoch": 0.15099934109378432, + "grad_norm": 2.449551820755005, + "learning_rate": 7.541163556531284e-07, + "loss": 0.9995, + "mean_token_accuracy": 0.7015043497085571, + "num_tokens": 35069655.0, + "step": 1375 + }, + { + "epoch": 0.15110915879639797, + "grad_norm": 2.481668710708618, + "learning_rate": 7.546652030735456e-07, + "loss": 1.1148, + "mean_token_accuracy": 0.6856927871704102, + "num_tokens": 35093613.0, + "step": 1376 + }, + { + "epoch": 0.15121897649901164, + "grad_norm": 2.3056046962738037, + "learning_rate": 7.552140504939627e-07, + "loss": 0.9739, + "mean_token_accuracy": 0.7017896771430969, + "num_tokens": 35117207.0, + "step": 1377 + }, + { + "epoch": 0.1513287942016253, + "grad_norm": 2.2048959732055664, + "learning_rate": 7.557628979143797e-07, + "loss": 1.0781, + "mean_token_accuracy": 0.6837673783302307, + "num_tokens": 35143570.0, + "step": 1378 + }, + { + "epoch": 0.15143861190423896, + "grad_norm": 2.413033962249756, + "learning_rate": 7.563117453347969e-07, + "loss": 1.0725, + "mean_token_accuracy": 0.6833010911941528, + "num_tokens": 35166858.0, + "step": 1379 + }, + { + "epoch": 0.15154842960685264, + "grad_norm": 2.1039371490478516, + "learning_rate": 7.568605927552139e-07, + "loss": 1.0521, + "mean_token_accuracy": 0.6865894794464111, + "num_tokens": 35194482.0, + "step": 1380 + }, + { + "epoch": 0.15165824730946628, + "grad_norm": 2.4024009704589844, + "learning_rate": 7.574094401756311e-07, + "loss": 0.9727, + "mean_token_accuracy": 0.7064192295074463, + "num_tokens": 35218238.0, + "step": 1381 + }, + { + "epoch": 0.15176806501207996, + "grad_norm": 2.37304949760437, + "learning_rate": 7.579582875960483e-07, + "loss": 1.1278, + "mean_token_accuracy": 
0.6626790761947632, + "num_tokens": 35241722.0, + "step": 1382 + }, + { + "epoch": 0.1518778827146936, + "grad_norm": 2.2737345695495605, + "learning_rate": 7.585071350164653e-07, + "loss": 1.0811, + "mean_token_accuracy": 0.6765503287315369, + "num_tokens": 35269514.0, + "step": 1383 + }, + { + "epoch": 0.15198770041730728, + "grad_norm": 2.243105173110962, + "learning_rate": 7.590559824368825e-07, + "loss": 0.9675, + "mean_token_accuracy": 0.7096331119537354, + "num_tokens": 35294044.0, + "step": 1384 + }, + { + "epoch": 0.15209751811992092, + "grad_norm": 2.1242830753326416, + "learning_rate": 7.596048298572997e-07, + "loss": 1.1156, + "mean_token_accuracy": 0.6679753661155701, + "num_tokens": 35323651.0, + "step": 1385 + }, + { + "epoch": 0.1522073358225346, + "grad_norm": 2.4899444580078125, + "learning_rate": 7.601536772777167e-07, + "loss": 1.0116, + "mean_token_accuracy": 0.6944916844367981, + "num_tokens": 35347824.0, + "step": 1386 + }, + { + "epoch": 0.15231715352514824, + "grad_norm": 2.0220706462860107, + "learning_rate": 7.607025246981339e-07, + "loss": 1.088, + "mean_token_accuracy": 0.678504467010498, + "num_tokens": 35379043.0, + "step": 1387 + }, + { + "epoch": 0.15242697122776191, + "grad_norm": 2.416546583175659, + "learning_rate": 7.612513721185511e-07, + "loss": 1.0069, + "mean_token_accuracy": 0.7114831805229187, + "num_tokens": 35401816.0, + "step": 1388 + }, + { + "epoch": 0.1525367889303756, + "grad_norm": 2.441318988800049, + "learning_rate": 7.618002195389681e-07, + "loss": 1.0478, + "mean_token_accuracy": 0.6925957202911377, + "num_tokens": 35426627.0, + "step": 1389 + }, + { + "epoch": 0.15264660663298923, + "grad_norm": 2.271779775619507, + "learning_rate": 7.623490669593853e-07, + "loss": 1.0546, + "mean_token_accuracy": 0.6860817670822144, + "num_tokens": 35454475.0, + "step": 1390 + }, + { + "epoch": 0.1527564243356029, + "grad_norm": 2.4414119720458984, + "learning_rate": 7.628979143798024e-07, + "loss": 1.0214, + 
"mean_token_accuracy": 0.6929366588592529, + "num_tokens": 35477984.0, + "step": 1391 + }, + { + "epoch": 0.15286624203821655, + "grad_norm": 2.3580617904663086, + "learning_rate": 7.634467618002195e-07, + "loss": 1.0536, + "mean_token_accuracy": 0.6866611242294312, + "num_tokens": 35503755.0, + "step": 1392 + }, + { + "epoch": 0.15297605974083023, + "grad_norm": 2.439192295074463, + "learning_rate": 7.639956092206367e-07, + "loss": 1.0639, + "mean_token_accuracy": 0.683086633682251, + "num_tokens": 35527418.0, + "step": 1393 + }, + { + "epoch": 0.15308587744344387, + "grad_norm": 2.387373447418213, + "learning_rate": 7.645444566410538e-07, + "loss": 1.1168, + "mean_token_accuracy": 0.6695125102996826, + "num_tokens": 35552050.0, + "step": 1394 + }, + { + "epoch": 0.15319569514605755, + "grad_norm": 2.2005231380462646, + "learning_rate": 7.650933040614709e-07, + "loss": 0.9383, + "mean_token_accuracy": 0.7272710800170898, + "num_tokens": 35578085.0, + "step": 1395 + }, + { + "epoch": 0.1533055128486712, + "grad_norm": 2.144294500350952, + "learning_rate": 7.656421514818881e-07, + "loss": 1.0468, + "mean_token_accuracy": 0.6909595727920532, + "num_tokens": 35607045.0, + "step": 1396 + }, + { + "epoch": 0.15341533055128487, + "grad_norm": 2.3147332668304443, + "learning_rate": 7.66190998902305e-07, + "loss": 1.02, + "mean_token_accuracy": 0.7000957131385803, + "num_tokens": 35633349.0, + "step": 1397 + }, + { + "epoch": 0.15352514825389854, + "grad_norm": 1.9793964624404907, + "learning_rate": 7.667398463227222e-07, + "loss": 1.0352, + "mean_token_accuracy": 0.6876805424690247, + "num_tokens": 35664924.0, + "step": 1398 + }, + { + "epoch": 0.1536349659565122, + "grad_norm": 2.222186326980591, + "learning_rate": 7.672886937431394e-07, + "loss": 1.1296, + "mean_token_accuracy": 0.6647217273712158, + "num_tokens": 35693431.0, + "step": 1399 + }, + { + "epoch": 0.15374478365912586, + "grad_norm": 2.5319571495056152, + "learning_rate": 7.678375411635564e-07, + "loss": 
1.043, + "mean_token_accuracy": 0.6889821290969849, + "num_tokens": 35715461.0, + "step": 1400 + }, + { + "epoch": 0.1538546013617395, + "grad_norm": 2.434109926223755, + "learning_rate": 7.683863885839736e-07, + "loss": 0.9971, + "mean_token_accuracy": 0.7024449110031128, + "num_tokens": 35737691.0, + "step": 1401 + }, + { + "epoch": 0.15396441906435318, + "grad_norm": 2.145463228225708, + "learning_rate": 7.689352360043907e-07, + "loss": 1.0463, + "mean_token_accuracy": 0.6840716004371643, + "num_tokens": 35766532.0, + "step": 1402 + }, + { + "epoch": 0.15407423676696683, + "grad_norm": 2.3059329986572266, + "learning_rate": 7.694840834248078e-07, + "loss": 1.0988, + "mean_token_accuracy": 0.6866449117660522, + "num_tokens": 35792052.0, + "step": 1403 + }, + { + "epoch": 0.1541840544695805, + "grad_norm": 2.403167724609375, + "learning_rate": 7.70032930845225e-07, + "loss": 1.0995, + "mean_token_accuracy": 0.6732248067855835, + "num_tokens": 35815134.0, + "step": 1404 + }, + { + "epoch": 0.15429387217219415, + "grad_norm": 2.173001527786255, + "learning_rate": 7.705817782656421e-07, + "loss": 0.967, + "mean_token_accuracy": 0.7081966996192932, + "num_tokens": 35841264.0, + "step": 1405 + }, + { + "epoch": 0.15440368987480782, + "grad_norm": 2.4381906986236572, + "learning_rate": 7.711306256860592e-07, + "loss": 1.0399, + "mean_token_accuracy": 0.6892980933189392, + "num_tokens": 35863751.0, + "step": 1406 + }, + { + "epoch": 0.1545135075774215, + "grad_norm": 2.6895008087158203, + "learning_rate": 7.716794731064764e-07, + "loss": 1.0087, + "mean_token_accuracy": 0.6963790655136108, + "num_tokens": 35885901.0, + "step": 1407 + }, + { + "epoch": 0.15462332528003514, + "grad_norm": 2.5654211044311523, + "learning_rate": 7.722283205268935e-07, + "loss": 1.0154, + "mean_token_accuracy": 0.7005674242973328, + "num_tokens": 35908269.0, + "step": 1408 + }, + { + "epoch": 0.1547331429826488, + "grad_norm": 2.4003031253814697, + "learning_rate": 7.727771679473106e-07, + 
"loss": 1.039, + "mean_token_accuracy": 0.6896732449531555, + "num_tokens": 35932589.0, + "step": 1409 + }, + { + "epoch": 0.15484296068526246, + "grad_norm": 2.2469873428344727, + "learning_rate": 7.733260153677278e-07, + "loss": 1.1144, + "mean_token_accuracy": 0.6784136295318604, + "num_tokens": 35959691.0, + "step": 1410 + }, + { + "epoch": 0.15495277838787613, + "grad_norm": 2.3883607387542725, + "learning_rate": 7.738748627881449e-07, + "loss": 1.0735, + "mean_token_accuracy": 0.6886520385742188, + "num_tokens": 35984241.0, + "step": 1411 + }, + { + "epoch": 0.15506259609048978, + "grad_norm": 2.3785500526428223, + "learning_rate": 7.74423710208562e-07, + "loss": 1.0255, + "mean_token_accuracy": 0.7029189467430115, + "num_tokens": 36010330.0, + "step": 1412 + }, + { + "epoch": 0.15517241379310345, + "grad_norm": 2.2267377376556396, + "learning_rate": 7.749725576289791e-07, + "loss": 1.0622, + "mean_token_accuracy": 0.679004430770874, + "num_tokens": 36036650.0, + "step": 1413 + }, + { + "epoch": 0.1552822314957171, + "grad_norm": 2.430182933807373, + "learning_rate": 7.755214050493963e-07, + "loss": 1.0488, + "mean_token_accuracy": 0.6873071193695068, + "num_tokens": 36058519.0, + "step": 1414 + }, + { + "epoch": 0.15539204919833077, + "grad_norm": 2.1765296459198, + "learning_rate": 7.760702524698134e-07, + "loss": 0.9408, + "mean_token_accuracy": 0.7123318910598755, + "num_tokens": 36083411.0, + "step": 1415 + }, + { + "epoch": 0.15550186690094442, + "grad_norm": 2.263545036315918, + "learning_rate": 7.766190998902304e-07, + "loss": 1.0046, + "mean_token_accuracy": 0.7051477432250977, + "num_tokens": 36108706.0, + "step": 1416 + }, + { + "epoch": 0.1556116846035581, + "grad_norm": 2.2634682655334473, + "learning_rate": 7.771679473106476e-07, + "loss": 0.965, + "mean_token_accuracy": 0.713206946849823, + "num_tokens": 36132754.0, + "step": 1417 + }, + { + "epoch": 0.15572150230617177, + "grad_norm": 2.2455403804779053, + "learning_rate": 
7.777167947310647e-07, + "loss": 1.0758, + "mean_token_accuracy": 0.6876735091209412, + "num_tokens": 36159634.0, + "step": 1418 + }, + { + "epoch": 0.1558313200087854, + "grad_norm": 2.3546667098999023, + "learning_rate": 7.782656421514818e-07, + "loss": 0.9919, + "mean_token_accuracy": 0.7088002562522888, + "num_tokens": 36182269.0, + "step": 1419 + }, + { + "epoch": 0.15594113771139909, + "grad_norm": 2.0355441570281982, + "learning_rate": 7.78814489571899e-07, + "loss": 1.1106, + "mean_token_accuracy": 0.6730431914329529, + "num_tokens": 36216976.0, + "step": 1420 + }, + { + "epoch": 0.15605095541401273, + "grad_norm": 2.1344096660614014, + "learning_rate": 7.793633369923161e-07, + "loss": 1.0754, + "mean_token_accuracy": 0.6797240972518921, + "num_tokens": 36246479.0, + "step": 1421 + }, + { + "epoch": 0.1561607731166264, + "grad_norm": 2.5450518131256104, + "learning_rate": 7.799121844127332e-07, + "loss": 1.1319, + "mean_token_accuracy": 0.6659543514251709, + "num_tokens": 36269500.0, + "step": 1422 + }, + { + "epoch": 0.15627059081924005, + "grad_norm": 2.1773083209991455, + "learning_rate": 7.804610318331504e-07, + "loss": 1.1368, + "mean_token_accuracy": 0.6626147627830505, + "num_tokens": 36299397.0, + "step": 1423 + }, + { + "epoch": 0.15638040852185373, + "grad_norm": 2.4341788291931152, + "learning_rate": 7.810098792535674e-07, + "loss": 1.0233, + "mean_token_accuracy": 0.6943049430847168, + "num_tokens": 36323440.0, + "step": 1424 + }, + { + "epoch": 0.15649022622446737, + "grad_norm": 2.3842031955718994, + "learning_rate": 7.815587266739846e-07, + "loss": 1.035, + "mean_token_accuracy": 0.6975597739219666, + "num_tokens": 36347558.0, + "step": 1425 + }, + { + "epoch": 0.15660004392708105, + "grad_norm": 2.2383840084075928, + "learning_rate": 7.821075740944018e-07, + "loss": 1.0704, + "mean_token_accuracy": 0.6895748972892761, + "num_tokens": 36375092.0, + "step": 1426 + }, + { + "epoch": 0.15670986162969472, + "grad_norm": 2.0402815341949463, + 
"learning_rate": 7.826564215148188e-07, + "loss": 1.0842, + "mean_token_accuracy": 0.682105302810669, + "num_tokens": 36407431.0, + "step": 1427 + }, + { + "epoch": 0.15681967933230836, + "grad_norm": 2.385915517807007, + "learning_rate": 7.83205268935236e-07, + "loss": 1.0879, + "mean_token_accuracy": 0.6820539832115173, + "num_tokens": 36431061.0, + "step": 1428 + }, + { + "epoch": 0.15692949703492204, + "grad_norm": 2.1481106281280518, + "learning_rate": 7.837541163556532e-07, + "loss": 1.0067, + "mean_token_accuracy": 0.6984994411468506, + "num_tokens": 36458548.0, + "step": 1429 + }, + { + "epoch": 0.15703931473753568, + "grad_norm": 2.1504454612731934, + "learning_rate": 7.843029637760702e-07, + "loss": 0.9739, + "mean_token_accuracy": 0.7188218832015991, + "num_tokens": 36484821.0, + "step": 1430 + }, + { + "epoch": 0.15714913244014936, + "grad_norm": 2.413045644760132, + "learning_rate": 7.848518111964874e-07, + "loss": 0.9118, + "mean_token_accuracy": 0.7178229093551636, + "num_tokens": 36505728.0, + "step": 1431 + }, + { + "epoch": 0.157258950142763, + "grad_norm": 2.589125156402588, + "learning_rate": 7.854006586169045e-07, + "loss": 1.0339, + "mean_token_accuracy": 0.6879565715789795, + "num_tokens": 36526957.0, + "step": 1432 + }, + { + "epoch": 0.15736876784537668, + "grad_norm": 2.5087549686431885, + "learning_rate": 7.859495060373216e-07, + "loss": 1.112, + "mean_token_accuracy": 0.6742915511131287, + "num_tokens": 36551484.0, + "step": 1433 + }, + { + "epoch": 0.15747858554799032, + "grad_norm": 2.257699489593506, + "learning_rate": 7.864983534577388e-07, + "loss": 0.9665, + "mean_token_accuracy": 0.7083507776260376, + "num_tokens": 36577053.0, + "step": 1434 + }, + { + "epoch": 0.157588403250604, + "grad_norm": 2.5142509937286377, + "learning_rate": 7.870472008781557e-07, + "loss": 1.0326, + "mean_token_accuracy": 0.6960191130638123, + "num_tokens": 36601104.0, + "step": 1435 + }, + { + "epoch": 0.15769822095321767, + "grad_norm": 
2.322221040725708, + "learning_rate": 7.875960482985729e-07, + "loss": 0.9427, + "mean_token_accuracy": 0.7209005355834961, + "num_tokens": 36624828.0, + "step": 1436 + }, + { + "epoch": 0.15780803865583132, + "grad_norm": 2.258134126663208, + "learning_rate": 7.881448957189901e-07, + "loss": 1.0946, + "mean_token_accuracy": 0.6709819436073303, + "num_tokens": 36653248.0, + "step": 1437 + }, + { + "epoch": 0.157917856358445, + "grad_norm": 2.1700940132141113, + "learning_rate": 7.886937431394071e-07, + "loss": 1.0822, + "mean_token_accuracy": 0.6744753122329712, + "num_tokens": 36682181.0, + "step": 1438 + }, + { + "epoch": 0.15802767406105864, + "grad_norm": 2.4097917079925537, + "learning_rate": 7.892425905598243e-07, + "loss": 0.9912, + "mean_token_accuracy": 0.6984603404998779, + "num_tokens": 36705927.0, + "step": 1439 + }, + { + "epoch": 0.1581374917636723, + "grad_norm": 2.6034092903137207, + "learning_rate": 7.897914379802415e-07, + "loss": 0.9291, + "mean_token_accuracy": 0.7194444537162781, + "num_tokens": 36729317.0, + "step": 1440 + }, + { + "epoch": 0.15824730946628596, + "grad_norm": 2.3654348850250244, + "learning_rate": 7.903402854006585e-07, + "loss": 0.9689, + "mean_token_accuracy": 0.7081182599067688, + "num_tokens": 36754064.0, + "step": 1441 + }, + { + "epoch": 0.15835712716889963, + "grad_norm": 2.5403385162353516, + "learning_rate": 7.908891328210757e-07, + "loss": 1.0609, + "mean_token_accuracy": 0.68746018409729, + "num_tokens": 36777840.0, + "step": 1442 + }, + { + "epoch": 0.15846694487151328, + "grad_norm": 2.4690041542053223, + "learning_rate": 7.914379802414928e-07, + "loss": 0.9131, + "mean_token_accuracy": 0.7211517095565796, + "num_tokens": 36800043.0, + "step": 1443 + }, + { + "epoch": 0.15857676257412695, + "grad_norm": 2.272028684616089, + "learning_rate": 7.919868276619099e-07, + "loss": 0.9439, + "mean_token_accuracy": 0.7106775045394897, + "num_tokens": 36829203.0, + "step": 1444 + }, + { + "epoch": 0.15868658027674062, + 
"grad_norm": 2.2912933826446533, + "learning_rate": 7.925356750823271e-07, + "loss": 1.0409, + "mean_token_accuracy": 0.6966907978057861, + "num_tokens": 36853566.0, + "step": 1445 + }, + { + "epoch": 0.15879639797935427, + "grad_norm": 1.9363689422607422, + "learning_rate": 7.930845225027442e-07, + "loss": 0.9819, + "mean_token_accuracy": 0.6993477940559387, + "num_tokens": 36884618.0, + "step": 1446 + }, + { + "epoch": 0.15890621568196794, + "grad_norm": 2.1924033164978027, + "learning_rate": 7.936333699231613e-07, + "loss": 1.0847, + "mean_token_accuracy": 0.6783944368362427, + "num_tokens": 36912722.0, + "step": 1447 + }, + { + "epoch": 0.1590160333845816, + "grad_norm": 2.5518743991851807, + "learning_rate": 7.941822173435785e-07, + "loss": 0.9986, + "mean_token_accuracy": 0.7011110782623291, + "num_tokens": 36934681.0, + "step": 1448 + }, + { + "epoch": 0.15912585108719526, + "grad_norm": 2.3291854858398438, + "learning_rate": 7.947310647639956e-07, + "loss": 1.0474, + "mean_token_accuracy": 0.6889116764068604, + "num_tokens": 36958912.0, + "step": 1449 + }, + { + "epoch": 0.1592356687898089, + "grad_norm": 2.3138585090637207, + "learning_rate": 7.952799121844127e-07, + "loss": 0.9667, + "mean_token_accuracy": 0.7078011631965637, + "num_tokens": 36983493.0, + "step": 1450 + }, + { + "epoch": 0.15934548649242258, + "grad_norm": 2.43939471244812, + "learning_rate": 7.958287596048299e-07, + "loss": 1.025, + "mean_token_accuracy": 0.6886082887649536, + "num_tokens": 37006043.0, + "step": 1451 + }, + { + "epoch": 0.15945530419503623, + "grad_norm": 2.1107163429260254, + "learning_rate": 7.96377607025247e-07, + "loss": 0.9652, + "mean_token_accuracy": 0.7110047936439514, + "num_tokens": 37034282.0, + "step": 1452 + }, + { + "epoch": 0.1595651218976499, + "grad_norm": 2.2979679107666016, + "learning_rate": 7.969264544456641e-07, + "loss": 1.0571, + "mean_token_accuracy": 0.68829745054245, + "num_tokens": 37057833.0, + "step": 1453 + }, + { + "epoch": 
0.15967493960026355, + "grad_norm": 2.638279914855957, + "learning_rate": 7.974753018660811e-07, + "loss": 1.1267, + "mean_token_accuracy": 0.6659857034683228, + "num_tokens": 37080616.0, + "step": 1454 + }, + { + "epoch": 0.15978475730287722, + "grad_norm": 2.1159608364105225, + "learning_rate": 7.980241492864983e-07, + "loss": 1.0511, + "mean_token_accuracy": 0.6935356855392456, + "num_tokens": 37108677.0, + "step": 1455 + }, + { + "epoch": 0.1598945750054909, + "grad_norm": 2.2923829555511475, + "learning_rate": 7.985729967069154e-07, + "loss": 1.0318, + "mean_token_accuracy": 0.6870169043540955, + "num_tokens": 37136603.0, + "step": 1456 + }, + { + "epoch": 0.16000439270810454, + "grad_norm": 2.474074602127075, + "learning_rate": 7.991218441273325e-07, + "loss": 1.0002, + "mean_token_accuracy": 0.6939405202865601, + "num_tokens": 37160936.0, + "step": 1457 + }, + { + "epoch": 0.16011421041071822, + "grad_norm": 2.5076425075531006, + "learning_rate": 7.996706915477497e-07, + "loss": 1.0055, + "mean_token_accuracy": 0.7034525871276855, + "num_tokens": 37182756.0, + "step": 1458 + }, + { + "epoch": 0.16022402811333186, + "grad_norm": 2.1704678535461426, + "learning_rate": 8.002195389681668e-07, + "loss": 1.0046, + "mean_token_accuracy": 0.6991225481033325, + "num_tokens": 37211001.0, + "step": 1459 + }, + { + "epoch": 0.16033384581594554, + "grad_norm": 2.3211007118225098, + "learning_rate": 8.007683863885839e-07, + "loss": 1.0701, + "mean_token_accuracy": 0.689017117023468, + "num_tokens": 37237094.0, + "step": 1460 + }, + { + "epoch": 0.16044366351855918, + "grad_norm": 2.175337553024292, + "learning_rate": 8.013172338090011e-07, + "loss": 0.9787, + "mean_token_accuracy": 0.7109650373458862, + "num_tokens": 37263846.0, + "step": 1461 + }, + { + "epoch": 0.16055348122117286, + "grad_norm": 2.5293846130371094, + "learning_rate": 8.018660812294182e-07, + "loss": 1.096, + "mean_token_accuracy": 0.6833835244178772, + "num_tokens": 37286045.0, + "step": 1462 + }, + { 
+ "epoch": 0.1606632989237865, + "grad_norm": 2.1995904445648193, + "learning_rate": 8.024149286498353e-07, + "loss": 0.9907, + "mean_token_accuracy": 0.7005220651626587, + "num_tokens": 37314732.0, + "step": 1463 + }, + { + "epoch": 0.16077311662640018, + "grad_norm": 2.192842960357666, + "learning_rate": 8.029637760702525e-07, + "loss": 1.0175, + "mean_token_accuracy": 0.701327919960022, + "num_tokens": 37342230.0, + "step": 1464 + }, + { + "epoch": 0.16088293432901385, + "grad_norm": 2.210026979446411, + "learning_rate": 8.035126234906695e-07, + "loss": 1.0348, + "mean_token_accuracy": 0.6861373782157898, + "num_tokens": 37370149.0, + "step": 1465 + }, + { + "epoch": 0.1609927520316275, + "grad_norm": 2.1233296394348145, + "learning_rate": 8.040614709110867e-07, + "loss": 1.0027, + "mean_token_accuracy": 0.7069189548492432, + "num_tokens": 37399918.0, + "step": 1466 + }, + { + "epoch": 0.16110256973424117, + "grad_norm": 2.289177656173706, + "learning_rate": 8.046103183315039e-07, + "loss": 0.9669, + "mean_token_accuracy": 0.7063785791397095, + "num_tokens": 37425820.0, + "step": 1467 + }, + { + "epoch": 0.16121238743685481, + "grad_norm": 2.3780319690704346, + "learning_rate": 8.051591657519209e-07, + "loss": 0.9929, + "mean_token_accuracy": 0.7011739611625671, + "num_tokens": 37450108.0, + "step": 1468 + }, + { + "epoch": 0.1613222051394685, + "grad_norm": 2.2233798503875732, + "learning_rate": 8.057080131723381e-07, + "loss": 1.144, + "mean_token_accuracy": 0.6584887504577637, + "num_tokens": 37479570.0, + "step": 1469 + }, + { + "epoch": 0.16143202284208213, + "grad_norm": 2.4481358528137207, + "learning_rate": 8.062568605927553e-07, + "loss": 1.0026, + "mean_token_accuracy": 0.6990314722061157, + "num_tokens": 37502238.0, + "step": 1470 + }, + { + "epoch": 0.1615418405446958, + "grad_norm": 2.4728994369506836, + "learning_rate": 8.068057080131723e-07, + "loss": 0.9823, + "mean_token_accuracy": 0.7026566863059998, + "num_tokens": 37523152.0, + "step": 1471 + 
}, + { + "epoch": 0.16165165824730945, + "grad_norm": 2.5555572509765625, + "learning_rate": 8.073545554335894e-07, + "loss": 0.9862, + "mean_token_accuracy": 0.7061547040939331, + "num_tokens": 37543956.0, + "step": 1472 + }, + { + "epoch": 0.16176147594992313, + "grad_norm": 2.206791877746582, + "learning_rate": 8.079034028540066e-07, + "loss": 1.123, + "mean_token_accuracy": 0.6679822206497192, + "num_tokens": 37575194.0, + "step": 1473 + }, + { + "epoch": 0.1618712936525368, + "grad_norm": 2.292071580886841, + "learning_rate": 8.084522502744236e-07, + "loss": 0.9949, + "mean_token_accuracy": 0.7012015581130981, + "num_tokens": 37603326.0, + "step": 1474 + }, + { + "epoch": 0.16198111135515045, + "grad_norm": 2.069775342941284, + "learning_rate": 8.090010976948408e-07, + "loss": 1.119, + "mean_token_accuracy": 0.6798419952392578, + "num_tokens": 37634672.0, + "step": 1475 + }, + { + "epoch": 0.16209092905776412, + "grad_norm": 2.395045280456543, + "learning_rate": 8.095499451152578e-07, + "loss": 1.1095, + "mean_token_accuracy": 0.6738231182098389, + "num_tokens": 37658704.0, + "step": 1476 + }, + { + "epoch": 0.16220074676037777, + "grad_norm": 2.7030160427093506, + "learning_rate": 8.10098792535675e-07, + "loss": 0.9516, + "mean_token_accuracy": 0.7126711010932922, + "num_tokens": 37678546.0, + "step": 1477 + }, + { + "epoch": 0.16231056446299144, + "grad_norm": 2.7089061737060547, + "learning_rate": 8.106476399560922e-07, + "loss": 1.0305, + "mean_token_accuracy": 0.6931569576263428, + "num_tokens": 37698863.0, + "step": 1478 + }, + { + "epoch": 0.1624203821656051, + "grad_norm": 2.2622592449188232, + "learning_rate": 8.111964873765092e-07, + "loss": 1.0226, + "mean_token_accuracy": 0.6942937970161438, + "num_tokens": 37723729.0, + "step": 1479 + }, + { + "epoch": 0.16253019986821876, + "grad_norm": 2.656311273574829, + "learning_rate": 8.117453347969264e-07, + "loss": 0.943, + "mean_token_accuracy": 0.7135013937950134, + "num_tokens": 37744073.0, + "step": 
1480 + }, + { + "epoch": 0.1626400175708324, + "grad_norm": 2.378708600997925, + "learning_rate": 8.122941822173436e-07, + "loss": 1.0043, + "mean_token_accuracy": 0.6958428621292114, + "num_tokens": 37768235.0, + "step": 1481 + }, + { + "epoch": 0.16274983527344608, + "grad_norm": 2.095726728439331, + "learning_rate": 8.128430296377606e-07, + "loss": 1.1108, + "mean_token_accuracy": 0.6679208278656006, + "num_tokens": 37800582.0, + "step": 1482 + }, + { + "epoch": 0.16285965297605975, + "grad_norm": 2.258225202560425, + "learning_rate": 8.133918770581778e-07, + "loss": 0.9615, + "mean_token_accuracy": 0.708588719367981, + "num_tokens": 37826923.0, + "step": 1483 + }, + { + "epoch": 0.1629694706786734, + "grad_norm": 2.320775032043457, + "learning_rate": 8.13940724478595e-07, + "loss": 0.9868, + "mean_token_accuracy": 0.699543833732605, + "num_tokens": 37851860.0, + "step": 1484 + }, + { + "epoch": 0.16307928838128707, + "grad_norm": 2.43738055229187, + "learning_rate": 8.14489571899012e-07, + "loss": 1.0468, + "mean_token_accuracy": 0.6918892860412598, + "num_tokens": 37875313.0, + "step": 1485 + }, + { + "epoch": 0.16318910608390072, + "grad_norm": 2.4662210941314697, + "learning_rate": 8.150384193194292e-07, + "loss": 1.0874, + "mean_token_accuracy": 0.6849679946899414, + "num_tokens": 37900157.0, + "step": 1486 + }, + { + "epoch": 0.1632989237865144, + "grad_norm": 2.205334186553955, + "learning_rate": 8.155872667398463e-07, + "loss": 1.1296, + "mean_token_accuracy": 0.6834502220153809, + "num_tokens": 37927504.0, + "step": 1487 + }, + { + "epoch": 0.16340874148912804, + "grad_norm": 2.9568986892700195, + "learning_rate": 8.161361141602634e-07, + "loss": 0.8668, + "mean_token_accuracy": 0.7327181100845337, + "num_tokens": 37944487.0, + "step": 1488 + }, + { + "epoch": 0.1635185591917417, + "grad_norm": 2.431356430053711, + "learning_rate": 8.166849615806806e-07, + "loss": 1.0643, + "mean_token_accuracy": 0.6822175979614258, + "num_tokens": 37969415.0, + "step": 
1489 + }, + { + "epoch": 0.16362837689435536, + "grad_norm": 2.2373616695404053, + "learning_rate": 8.172338090010977e-07, + "loss": 1.0976, + "mean_token_accuracy": 0.6779497265815735, + "num_tokens": 38000347.0, + "step": 1490 + }, + { + "epoch": 0.16373819459696903, + "grad_norm": 2.512568235397339, + "learning_rate": 8.177826564215147e-07, + "loss": 1.0864, + "mean_token_accuracy": 0.690385103225708, + "num_tokens": 38022645.0, + "step": 1491 + }, + { + "epoch": 0.16384801229958268, + "grad_norm": 2.250359058380127, + "learning_rate": 8.183315038419319e-07, + "loss": 1.0109, + "mean_token_accuracy": 0.7000926733016968, + "num_tokens": 38048961.0, + "step": 1492 + }, + { + "epoch": 0.16395783000219635, + "grad_norm": 2.483074426651001, + "learning_rate": 8.18880351262349e-07, + "loss": 0.8883, + "mean_token_accuracy": 0.7291032671928406, + "num_tokens": 38069776.0, + "step": 1493 + }, + { + "epoch": 0.16406764770481003, + "grad_norm": 2.4231789112091064, + "learning_rate": 8.194291986827661e-07, + "loss": 1.052, + "mean_token_accuracy": 0.6898403167724609, + "num_tokens": 38094486.0, + "step": 1494 + }, + { + "epoch": 0.16417746540742367, + "grad_norm": 2.6460561752319336, + "learning_rate": 8.199780461031833e-07, + "loss": 0.9529, + "mean_token_accuracy": 0.7113127708435059, + "num_tokens": 38116797.0, + "step": 1495 + }, + { + "epoch": 0.16428728311003735, + "grad_norm": 2.12825083732605, + "learning_rate": 8.205268935236004e-07, + "loss": 1.008, + "mean_token_accuracy": 0.6954137086868286, + "num_tokens": 38145321.0, + "step": 1496 + }, + { + "epoch": 0.164397100812651, + "grad_norm": 2.6880905628204346, + "learning_rate": 8.210757409440175e-07, + "loss": 1.0742, + "mean_token_accuracy": 0.6792540550231934, + "num_tokens": 38167016.0, + "step": 1497 + }, + { + "epoch": 0.16450691851526467, + "grad_norm": 2.48612642288208, + "learning_rate": 8.216245883644346e-07, + "loss": 1.0584, + "mean_token_accuracy": 0.6875435709953308, + "num_tokens": 38190281.0, + 
"step": 1498 + }, + { + "epoch": 0.1646167362178783, + "grad_norm": 2.308637857437134, + "learning_rate": 8.221734357848518e-07, + "loss": 1.0772, + "mean_token_accuracy": 0.685853123664856, + "num_tokens": 38215282.0, + "step": 1499 + }, + { + "epoch": 0.16472655392049199, + "grad_norm": 2.3050999641418457, + "learning_rate": 8.227222832052689e-07, + "loss": 0.9584, + "mean_token_accuracy": 0.7170395851135254, + "num_tokens": 38238432.0, + "step": 1500 + }, + { + "epoch": 0.16483637162310563, + "grad_norm": 2.1268980503082275, + "learning_rate": 8.23271130625686e-07, + "loss": 1.0443, + "mean_token_accuracy": 0.6906695365905762, + "num_tokens": 38267307.0, + "step": 1501 + }, + { + "epoch": 0.1649461893257193, + "grad_norm": 2.2509396076202393, + "learning_rate": 8.238199780461032e-07, + "loss": 0.9674, + "mean_token_accuracy": 0.7076799273490906, + "num_tokens": 38292272.0, + "step": 1502 + }, + { + "epoch": 0.16505600702833298, + "grad_norm": 2.377654552459717, + "learning_rate": 8.243688254665203e-07, + "loss": 0.8407, + "mean_token_accuracy": 0.7362675666809082, + "num_tokens": 38312638.0, + "step": 1503 + }, + { + "epoch": 0.16516582473094663, + "grad_norm": 2.4500396251678467, + "learning_rate": 8.249176728869374e-07, + "loss": 0.964, + "mean_token_accuracy": 0.720169186592102, + "num_tokens": 38334250.0, + "step": 1504 + }, + { + "epoch": 0.1652756424335603, + "grad_norm": 2.466402769088745, + "learning_rate": 8.254665203073546e-07, + "loss": 0.939, + "mean_token_accuracy": 0.7166883945465088, + "num_tokens": 38356584.0, + "step": 1505 + }, + { + "epoch": 0.16538546013617395, + "grad_norm": 2.43768572807312, + "learning_rate": 8.260153677277717e-07, + "loss": 0.9629, + "mean_token_accuracy": 0.7077465057373047, + "num_tokens": 38378279.0, + "step": 1506 + }, + { + "epoch": 0.16549527783878762, + "grad_norm": 2.5852131843566895, + "learning_rate": 8.265642151481888e-07, + "loss": 0.997, + "mean_token_accuracy": 0.6979437470436096, + "num_tokens": 38399213.0, 
+ "step": 1507 + }, + { + "epoch": 0.16560509554140126, + "grad_norm": 2.4286930561065674, + "learning_rate": 8.27113062568606e-07, + "loss": 0.9516, + "mean_token_accuracy": 0.7117777466773987, + "num_tokens": 38421699.0, + "step": 1508 + }, + { + "epoch": 0.16571491324401494, + "grad_norm": 2.5245490074157715, + "learning_rate": 8.27661909989023e-07, + "loss": 1.0433, + "mean_token_accuracy": 0.7002484798431396, + "num_tokens": 38444071.0, + "step": 1509 + }, + { + "epoch": 0.16582473094662858, + "grad_norm": 2.429050922393799, + "learning_rate": 8.282107574094401e-07, + "loss": 1.0325, + "mean_token_accuracy": 0.6879536509513855, + "num_tokens": 38467031.0, + "step": 1510 + }, + { + "epoch": 0.16593454864924226, + "grad_norm": 2.412946939468384, + "learning_rate": 8.287596048298572e-07, + "loss": 1.0047, + "mean_token_accuracy": 0.6956976652145386, + "num_tokens": 38490729.0, + "step": 1511 + }, + { + "epoch": 0.16604436635185593, + "grad_norm": 2.252845525741577, + "learning_rate": 8.293084522502743e-07, + "loss": 1.0487, + "mean_token_accuracy": 0.6897742748260498, + "num_tokens": 38515023.0, + "step": 1512 + }, + { + "epoch": 0.16615418405446958, + "grad_norm": 2.38234543800354, + "learning_rate": 8.298572996706915e-07, + "loss": 1.0048, + "mean_token_accuracy": 0.7089638710021973, + "num_tokens": 38539878.0, + "step": 1513 + }, + { + "epoch": 0.16626400175708325, + "grad_norm": 2.2392868995666504, + "learning_rate": 8.304061470911086e-07, + "loss": 1.0934, + "mean_token_accuracy": 0.6731115579605103, + "num_tokens": 38566500.0, + "step": 1514 + }, + { + "epoch": 0.1663738194596969, + "grad_norm": 2.441378593444824, + "learning_rate": 8.309549945115257e-07, + "loss": 1.0096, + "mean_token_accuracy": 0.7009507417678833, + "num_tokens": 38589440.0, + "step": 1515 + }, + { + "epoch": 0.16648363716231057, + "grad_norm": 2.1875662803649902, + "learning_rate": 8.315038419319429e-07, + "loss": 0.9062, + "mean_token_accuracy": 0.7314031720161438, + "num_tokens": 
38613831.0, + "step": 1516 + }, + { + "epoch": 0.16659345486492422, + "grad_norm": 2.0460619926452637, + "learning_rate": 8.3205268935236e-07, + "loss": 1.036, + "mean_token_accuracy": 0.6960186958312988, + "num_tokens": 38644436.0, + "step": 1517 + }, + { + "epoch": 0.1667032725675379, + "grad_norm": 2.5301029682159424, + "learning_rate": 8.326015367727771e-07, + "loss": 1.113, + "mean_token_accuracy": 0.66834557056427, + "num_tokens": 38666335.0, + "step": 1518 + }, + { + "epoch": 0.16681309027015154, + "grad_norm": 2.368441104888916, + "learning_rate": 8.331503841931943e-07, + "loss": 1.0914, + "mean_token_accuracy": 0.6792005896568298, + "num_tokens": 38690150.0, + "step": 1519 + }, + { + "epoch": 0.1669229079727652, + "grad_norm": 2.244135618209839, + "learning_rate": 8.336992316136113e-07, + "loss": 1.0503, + "mean_token_accuracy": 0.6883647441864014, + "num_tokens": 38716910.0, + "step": 1520 + }, + { + "epoch": 0.16703272567537888, + "grad_norm": 2.844050407409668, + "learning_rate": 8.342480790340285e-07, + "loss": 0.9653, + "mean_token_accuracy": 0.7108055949211121, + "num_tokens": 38736241.0, + "step": 1521 + }, + { + "epoch": 0.16714254337799253, + "grad_norm": 2.2512412071228027, + "learning_rate": 8.347969264544457e-07, + "loss": 1.0464, + "mean_token_accuracy": 0.6877880096435547, + "num_tokens": 38763416.0, + "step": 1522 + }, + { + "epoch": 0.1672523610806062, + "grad_norm": 2.3135228157043457, + "learning_rate": 8.353457738748627e-07, + "loss": 1.0313, + "mean_token_accuracy": 0.694653332233429, + "num_tokens": 38787462.0, + "step": 1523 + }, + { + "epoch": 0.16736217878321985, + "grad_norm": 2.2459802627563477, + "learning_rate": 8.358946212952799e-07, + "loss": 0.9971, + "mean_token_accuracy": 0.7101995944976807, + "num_tokens": 38813932.0, + "step": 1524 + }, + { + "epoch": 0.16747199648583352, + "grad_norm": 2.0397486686706543, + "learning_rate": 8.364434687156971e-07, + "loss": 1.0523, + "mean_token_accuracy": 0.6845923662185669, + 
"num_tokens": 38845417.0, + "step": 1525 + }, + { + "epoch": 0.16758181418844717, + "grad_norm": 2.239753484725952, + "learning_rate": 8.369923161361141e-07, + "loss": 1.0132, + "mean_token_accuracy": 0.6974314451217651, + "num_tokens": 38870295.0, + "step": 1526 + }, + { + "epoch": 0.16769163189106084, + "grad_norm": 2.2087552547454834, + "learning_rate": 8.375411635565313e-07, + "loss": 1.0115, + "mean_token_accuracy": 0.6952781081199646, + "num_tokens": 38896938.0, + "step": 1527 + }, + { + "epoch": 0.1678014495936745, + "grad_norm": 2.3108527660369873, + "learning_rate": 8.380900109769485e-07, + "loss": 0.9855, + "mean_token_accuracy": 0.7069202065467834, + "num_tokens": 38921841.0, + "step": 1528 + }, + { + "epoch": 0.16791126729628816, + "grad_norm": 2.3256592750549316, + "learning_rate": 8.386388583973654e-07, + "loss": 1.0569, + "mean_token_accuracy": 0.6968789100646973, + "num_tokens": 38947523.0, + "step": 1529 + }, + { + "epoch": 0.1680210849989018, + "grad_norm": 2.2826123237609863, + "learning_rate": 8.391877058177826e-07, + "loss": 1.1062, + "mean_token_accuracy": 0.6743258237838745, + "num_tokens": 38974891.0, + "step": 1530 + }, + { + "epoch": 0.16813090270151548, + "grad_norm": 2.1126229763031006, + "learning_rate": 8.397365532381997e-07, + "loss": 0.9164, + "mean_token_accuracy": 0.7305420637130737, + "num_tokens": 39001774.0, + "step": 1531 + }, + { + "epoch": 0.16824072040412916, + "grad_norm": 2.6228604316711426, + "learning_rate": 8.402854006586168e-07, + "loss": 1.051, + "mean_token_accuracy": 0.6842544078826904, + "num_tokens": 39022381.0, + "step": 1532 + }, + { + "epoch": 0.1683505381067428, + "grad_norm": 2.168761968612671, + "learning_rate": 8.40834248079034e-07, + "loss": 1.0747, + "mean_token_accuracy": 0.6793092489242554, + "num_tokens": 39052136.0, + "step": 1533 + }, + { + "epoch": 0.16846035580935648, + "grad_norm": 2.503444194793701, + "learning_rate": 8.413830954994511e-07, + "loss": 1.079, + "mean_token_accuracy": 
0.6844319105148315, + "num_tokens": 39075902.0, + "step": 1534 + }, + { + "epoch": 0.16857017351197012, + "grad_norm": 2.6869454383850098, + "learning_rate": 8.419319429198682e-07, + "loss": 1.0448, + "mean_token_accuracy": 0.6874620318412781, + "num_tokens": 39098253.0, + "step": 1535 + }, + { + "epoch": 0.1686799912145838, + "grad_norm": 2.3871536254882812, + "learning_rate": 8.424807903402854e-07, + "loss": 0.9825, + "mean_token_accuracy": 0.7065322399139404, + "num_tokens": 39122298.0, + "step": 1536 + }, + { + "epoch": 0.16878980891719744, + "grad_norm": 2.3531289100646973, + "learning_rate": 8.430296377607025e-07, + "loss": 1.0229, + "mean_token_accuracy": 0.6938257217407227, + "num_tokens": 39147859.0, + "step": 1537 + }, + { + "epoch": 0.16889962661981112, + "grad_norm": 2.5364508628845215, + "learning_rate": 8.435784851811196e-07, + "loss": 1.0658, + "mean_token_accuracy": 0.678507387638092, + "num_tokens": 39170829.0, + "step": 1538 + }, + { + "epoch": 0.16900944432242476, + "grad_norm": 2.3845102787017822, + "learning_rate": 8.441273326015367e-07, + "loss": 1.0518, + "mean_token_accuracy": 0.6949501037597656, + "num_tokens": 39195908.0, + "step": 1539 + }, + { + "epoch": 0.16911926202503844, + "grad_norm": 2.1034059524536133, + "learning_rate": 8.446761800219539e-07, + "loss": 1.1169, + "mean_token_accuracy": 0.6650518774986267, + "num_tokens": 39226484.0, + "step": 1540 + }, + { + "epoch": 0.1692290797276521, + "grad_norm": 2.275341033935547, + "learning_rate": 8.45225027442371e-07, + "loss": 1.0935, + "mean_token_accuracy": 0.6759446263313293, + "num_tokens": 39252255.0, + "step": 1541 + }, + { + "epoch": 0.16933889743026576, + "grad_norm": 2.3162338733673096, + "learning_rate": 8.457738748627881e-07, + "loss": 1.0376, + "mean_token_accuracy": 0.6917005777359009, + "num_tokens": 39276094.0, + "step": 1542 + }, + { + "epoch": 0.16944871513287943, + "grad_norm": 2.4267852306365967, + "learning_rate": 8.463227222832053e-07, + "loss": 0.9686, + 
"mean_token_accuracy": 0.708162784576416, + "num_tokens": 39299153.0, + "step": 1543 + }, + { + "epoch": 0.16955853283549308, + "grad_norm": 2.254448652267456, + "learning_rate": 8.468715697036224e-07, + "loss": 1.0494, + "mean_token_accuracy": 0.6883390545845032, + "num_tokens": 39323694.0, + "step": 1544 + }, + { + "epoch": 0.16966835053810675, + "grad_norm": 2.3975090980529785, + "learning_rate": 8.474204171240395e-07, + "loss": 1.0199, + "mean_token_accuracy": 0.6977909803390503, + "num_tokens": 39346692.0, + "step": 1545 + }, + { + "epoch": 0.1697781682407204, + "grad_norm": 2.2252211570739746, + "learning_rate": 8.479692645444567e-07, + "loss": 0.9638, + "mean_token_accuracy": 0.706797182559967, + "num_tokens": 39370688.0, + "step": 1546 + }, + { + "epoch": 0.16988798594333407, + "grad_norm": 2.6138558387756348, + "learning_rate": 8.485181119648738e-07, + "loss": 1.0683, + "mean_token_accuracy": 0.6867484450340271, + "num_tokens": 39393029.0, + "step": 1547 + }, + { + "epoch": 0.16999780364594771, + "grad_norm": 2.196873664855957, + "learning_rate": 8.490669593852908e-07, + "loss": 0.9933, + "mean_token_accuracy": 0.7012841105461121, + "num_tokens": 39419360.0, + "step": 1548 + }, + { + "epoch": 0.1701076213485614, + "grad_norm": 2.2546863555908203, + "learning_rate": 8.496158068057079e-07, + "loss": 1.02, + "mean_token_accuracy": 0.7005600929260254, + "num_tokens": 39445874.0, + "step": 1549 + }, + { + "epoch": 0.17021743905117506, + "grad_norm": 2.34171199798584, + "learning_rate": 8.50164654226125e-07, + "loss": 0.9842, + "mean_token_accuracy": 0.7022943496704102, + "num_tokens": 39470557.0, + "step": 1550 + }, + { + "epoch": 0.1703272567537887, + "grad_norm": 2.662562608718872, + "learning_rate": 8.507135016465422e-07, + "loss": 1.0139, + "mean_token_accuracy": 0.6988925933837891, + "num_tokens": 39490369.0, + "step": 1551 + }, + { + "epoch": 0.17043707445640238, + "grad_norm": 2.374929666519165, + "learning_rate": 8.512623490669593e-07, + "loss": 1.0111, 
+ "mean_token_accuracy": 0.7030456066131592, + "num_tokens": 39514022.0, + "step": 1552 + }, + { + "epoch": 0.17054689215901603, + "grad_norm": 2.25433611869812, + "learning_rate": 8.518111964873764e-07, + "loss": 1.0564, + "mean_token_accuracy": 0.6809617280960083, + "num_tokens": 39540570.0, + "step": 1553 + }, + { + "epoch": 0.1706567098616297, + "grad_norm": 2.069956064224243, + "learning_rate": 8.523600439077936e-07, + "loss": 0.9929, + "mean_token_accuracy": 0.6974359154701233, + "num_tokens": 39568902.0, + "step": 1554 + }, + { + "epoch": 0.17076652756424335, + "grad_norm": 2.0076279640197754, + "learning_rate": 8.529088913282107e-07, + "loss": 1.0901, + "mean_token_accuracy": 0.6729156970977783, + "num_tokens": 39602027.0, + "step": 1555 + }, + { + "epoch": 0.17087634526685702, + "grad_norm": 2.1273880004882812, + "learning_rate": 8.534577387486278e-07, + "loss": 1.0851, + "mean_token_accuracy": 0.6701798439025879, + "num_tokens": 39631919.0, + "step": 1556 + }, + { + "epoch": 0.17098616296947067, + "grad_norm": 2.1154160499572754, + "learning_rate": 8.54006586169045e-07, + "loss": 0.9881, + "mean_token_accuracy": 0.6985658407211304, + "num_tokens": 39659962.0, + "step": 1557 + }, + { + "epoch": 0.17109598067208434, + "grad_norm": 2.178969144821167, + "learning_rate": 8.545554335894621e-07, + "loss": 1.1236, + "mean_token_accuracy": 0.668639063835144, + "num_tokens": 39689278.0, + "step": 1558 + }, + { + "epoch": 0.17120579837469801, + "grad_norm": 2.2143263816833496, + "learning_rate": 8.551042810098792e-07, + "loss": 1.0701, + "mean_token_accuracy": 0.6815639734268188, + "num_tokens": 39718267.0, + "step": 1559 + }, + { + "epoch": 0.17131561607731166, + "grad_norm": 2.328348159790039, + "learning_rate": 8.556531284302964e-07, + "loss": 1.1379, + "mean_token_accuracy": 0.6677097678184509, + "num_tokens": 39745671.0, + "step": 1560 + }, + { + "epoch": 0.17142543377992533, + "grad_norm": 2.3722281455993652, + "learning_rate": 8.562019758507134e-07, + "loss": 
1.0212, + "mean_token_accuracy": 0.7001415491104126, + "num_tokens": 39769162.0, + "step": 1561 + }, + { + "epoch": 0.17153525148253898, + "grad_norm": 2.2093560695648193, + "learning_rate": 8.567508232711306e-07, + "loss": 1.0637, + "mean_token_accuracy": 0.683884859085083, + "num_tokens": 39798627.0, + "step": 1562 + }, + { + "epoch": 0.17164506918515265, + "grad_norm": 2.3545596599578857, + "learning_rate": 8.572996706915478e-07, + "loss": 0.9636, + "mean_token_accuracy": 0.710594892501831, + "num_tokens": 39822228.0, + "step": 1563 + }, + { + "epoch": 0.1717548868877663, + "grad_norm": 2.6442999839782715, + "learning_rate": 8.578485181119648e-07, + "loss": 0.9343, + "mean_token_accuracy": 0.7138054370880127, + "num_tokens": 39840810.0, + "step": 1564 + }, + { + "epoch": 0.17186470459037997, + "grad_norm": 2.136512041091919, + "learning_rate": 8.58397365532382e-07, + "loss": 1.0025, + "mean_token_accuracy": 0.6971674561500549, + "num_tokens": 39870874.0, + "step": 1565 + }, + { + "epoch": 0.17197452229299362, + "grad_norm": 2.2571144104003906, + "learning_rate": 8.589462129527992e-07, + "loss": 1.0774, + "mean_token_accuracy": 0.6751494407653809, + "num_tokens": 39899120.0, + "step": 1566 + }, + { + "epoch": 0.1720843399956073, + "grad_norm": 2.061798334121704, + "learning_rate": 8.594950603732161e-07, + "loss": 1.0354, + "mean_token_accuracy": 0.6924315690994263, + "num_tokens": 39930879.0, + "step": 1567 + }, + { + "epoch": 0.17219415769822094, + "grad_norm": 2.492831230163574, + "learning_rate": 8.600439077936333e-07, + "loss": 0.957, + "mean_token_accuracy": 0.707391619682312, + "num_tokens": 39953166.0, + "step": 1568 + }, + { + "epoch": 0.1723039754008346, + "grad_norm": 2.2734134197235107, + "learning_rate": 8.605927552140505e-07, + "loss": 1.0009, + "mean_token_accuracy": 0.7001231908798218, + "num_tokens": 39978763.0, + "step": 1569 + }, + { + "epoch": 0.1724137931034483, + "grad_norm": 2.643470525741577, + "learning_rate": 8.611416026344675e-07, + 
"loss": 0.9606, + "mean_token_accuracy": 0.7047103643417358, + "num_tokens": 39997897.0, + "step": 1570 + }, + { + "epoch": 0.17252361080606193, + "grad_norm": 2.2227530479431152, + "learning_rate": 8.616904500548847e-07, + "loss": 1.0506, + "mean_token_accuracy": 0.6875470280647278, + "num_tokens": 40025173.0, + "step": 1571 + }, + { + "epoch": 0.1726334285086756, + "grad_norm": 2.178765058517456, + "learning_rate": 8.622392974753018e-07, + "loss": 0.9831, + "mean_token_accuracy": 0.7051206231117249, + "num_tokens": 40051302.0, + "step": 1572 + }, + { + "epoch": 0.17274324621128925, + "grad_norm": 2.1714348793029785, + "learning_rate": 8.627881448957189e-07, + "loss": 1.1039, + "mean_token_accuracy": 0.6729558706283569, + "num_tokens": 40082758.0, + "step": 1573 + }, + { + "epoch": 0.17285306391390293, + "grad_norm": 2.4331698417663574, + "learning_rate": 8.633369923161361e-07, + "loss": 1.0037, + "mean_token_accuracy": 0.6943647861480713, + "num_tokens": 40104554.0, + "step": 1574 + }, + { + "epoch": 0.17296288161651657, + "grad_norm": 2.046131134033203, + "learning_rate": 8.638858397365532e-07, + "loss": 0.9888, + "mean_token_accuracy": 0.7030996680259705, + "num_tokens": 40134148.0, + "step": 1575 + }, + { + "epoch": 0.17307269931913025, + "grad_norm": 2.682995080947876, + "learning_rate": 8.644346871569703e-07, + "loss": 0.9744, + "mean_token_accuracy": 0.7050656676292419, + "num_tokens": 40154408.0, + "step": 1576 + }, + { + "epoch": 0.1731825170217439, + "grad_norm": 2.327631950378418, + "learning_rate": 8.649835345773875e-07, + "loss": 1.0207, + "mean_token_accuracy": 0.6933890581130981, + "num_tokens": 40178471.0, + "step": 1577 + }, + { + "epoch": 0.17329233472435757, + "grad_norm": 2.4080166816711426, + "learning_rate": 8.655323819978046e-07, + "loss": 1.0143, + "mean_token_accuracy": 0.7041718363761902, + "num_tokens": 40206034.0, + "step": 1578 + }, + { + "epoch": 0.17340215242697124, + "grad_norm": 2.235563278198242, + "learning_rate": 
8.660812294182217e-07, + "loss": 1.0144, + "mean_token_accuracy": 0.6872931718826294, + "num_tokens": 40232806.0, + "step": 1579 + }, + { + "epoch": 0.17351197012958489, + "grad_norm": 2.3970518112182617, + "learning_rate": 8.666300768386389e-07, + "loss": 0.9374, + "mean_token_accuracy": 0.7157453298568726, + "num_tokens": 40255701.0, + "step": 1580 + }, + { + "epoch": 0.17362178783219856, + "grad_norm": 2.329803943634033, + "learning_rate": 8.67178924259056e-07, + "loss": 1.0602, + "mean_token_accuracy": 0.6839625239372253, + "num_tokens": 40279975.0, + "step": 1581 + }, + { + "epoch": 0.1737316055348122, + "grad_norm": 2.473010540008545, + "learning_rate": 8.677277716794731e-07, + "loss": 0.9857, + "mean_token_accuracy": 0.6980183124542236, + "num_tokens": 40302202.0, + "step": 1582 + }, + { + "epoch": 0.17384142323742588, + "grad_norm": 2.1558613777160645, + "learning_rate": 8.682766190998902e-07, + "loss": 1.1185, + "mean_token_accuracy": 0.6739908456802368, + "num_tokens": 40334464.0, + "step": 1583 + }, + { + "epoch": 0.17395124094003953, + "grad_norm": 2.029632568359375, + "learning_rate": 8.688254665203073e-07, + "loss": 1.0543, + "mean_token_accuracy": 0.6865155100822449, + "num_tokens": 40367865.0, + "step": 1584 + }, + { + "epoch": 0.1740610586426532, + "grad_norm": 2.3286044597625732, + "learning_rate": 8.693743139407245e-07, + "loss": 1.1372, + "mean_token_accuracy": 0.6648807525634766, + "num_tokens": 40395667.0, + "step": 1585 + }, + { + "epoch": 0.17417087634526685, + "grad_norm": 2.40525484085083, + "learning_rate": 8.699231613611415e-07, + "loss": 1.014, + "mean_token_accuracy": 0.6977182626724243, + "num_tokens": 40419169.0, + "step": 1586 + }, + { + "epoch": 0.17428069404788052, + "grad_norm": 2.2524023056030273, + "learning_rate": 8.704720087815586e-07, + "loss": 1.0673, + "mean_token_accuracy": 0.6917927861213684, + "num_tokens": 40445633.0, + "step": 1587 + }, + { + "epoch": 0.1743905117504942, + "grad_norm": 2.437138080596924, + 
"learning_rate": 8.710208562019758e-07, + "loss": 1.0562, + "mean_token_accuracy": 0.7002856731414795, + "num_tokens": 40469831.0, + "step": 1588 + }, + { + "epoch": 0.17450032945310784, + "grad_norm": 2.167473316192627, + "learning_rate": 8.715697036223929e-07, + "loss": 1.0285, + "mean_token_accuracy": 0.6934788227081299, + "num_tokens": 40496789.0, + "step": 1589 + }, + { + "epoch": 0.1746101471557215, + "grad_norm": 2.3139114379882812, + "learning_rate": 8.7211855104281e-07, + "loss": 1.0191, + "mean_token_accuracy": 0.7010328769683838, + "num_tokens": 40521562.0, + "step": 1590 + }, + { + "epoch": 0.17471996485833516, + "grad_norm": 2.3556089401245117, + "learning_rate": 8.726673984632272e-07, + "loss": 0.9922, + "mean_token_accuracy": 0.7024954557418823, + "num_tokens": 40546758.0, + "step": 1591 + }, + { + "epoch": 0.17482978256094883, + "grad_norm": 2.2100517749786377, + "learning_rate": 8.732162458836443e-07, + "loss": 1.1028, + "mean_token_accuracy": 0.6779620051383972, + "num_tokens": 40574662.0, + "step": 1592 + }, + { + "epoch": 0.17493960026356248, + "grad_norm": 2.2229418754577637, + "learning_rate": 8.737650933040614e-07, + "loss": 1.0293, + "mean_token_accuracy": 0.6944108009338379, + "num_tokens": 40603469.0, + "step": 1593 + }, + { + "epoch": 0.17504941796617615, + "grad_norm": 2.4198787212371826, + "learning_rate": 8.743139407244785e-07, + "loss": 0.8996, + "mean_token_accuracy": 0.7195813655853271, + "num_tokens": 40624517.0, + "step": 1594 + }, + { + "epoch": 0.1751592356687898, + "grad_norm": 2.2890186309814453, + "learning_rate": 8.748627881448957e-07, + "loss": 1.0414, + "mean_token_accuracy": 0.6897562742233276, + "num_tokens": 40649874.0, + "step": 1595 + }, + { + "epoch": 0.17526905337140347, + "grad_norm": 2.5776195526123047, + "learning_rate": 8.754116355653128e-07, + "loss": 1.0421, + "mean_token_accuracy": 0.6845945119857788, + "num_tokens": 40671546.0, + "step": 1596 + }, + { + "epoch": 0.17537887107401715, + "grad_norm": 
2.490032911300659, + "learning_rate": 8.759604829857299e-07, + "loss": 0.9631, + "mean_token_accuracy": 0.7061427235603333, + "num_tokens": 40694043.0, + "step": 1597 + }, + { + "epoch": 0.1754886887766308, + "grad_norm": 2.5583255290985107, + "learning_rate": 8.765093304061471e-07, + "loss": 1.0033, + "mean_token_accuracy": 0.6987941861152649, + "num_tokens": 40719431.0, + "step": 1598 + }, + { + "epoch": 0.17559850647924446, + "grad_norm": 2.0802505016326904, + "learning_rate": 8.770581778265642e-07, + "loss": 1.0482, + "mean_token_accuracy": 0.6873420476913452, + "num_tokens": 40748411.0, + "step": 1599 + }, + { + "epoch": 0.1757083241818581, + "grad_norm": 2.0186383724212646, + "learning_rate": 8.776070252469813e-07, + "loss": 0.9687, + "mean_token_accuracy": 0.7069237232208252, + "num_tokens": 40775921.0, + "step": 1600 + }, + { + "epoch": 0.17581814188447178, + "grad_norm": 2.296041250228882, + "learning_rate": 8.781558726673985e-07, + "loss": 1.0241, + "mean_token_accuracy": 0.6960130333900452, + "num_tokens": 40800247.0, + "step": 1601 + }, + { + "epoch": 0.17592795958708543, + "grad_norm": 2.7195091247558594, + "learning_rate": 8.787047200878156e-07, + "loss": 0.9685, + "mean_token_accuracy": 0.7077893018722534, + "num_tokens": 40818803.0, + "step": 1602 + }, + { + "epoch": 0.1760377772896991, + "grad_norm": 2.15861177444458, + "learning_rate": 8.792535675082327e-07, + "loss": 1.143, + "mean_token_accuracy": 0.6656126379966736, + "num_tokens": 40846606.0, + "step": 1603 + }, + { + "epoch": 0.17614759499231275, + "grad_norm": 2.2541775703430176, + "learning_rate": 8.798024149286499e-07, + "loss": 1.0322, + "mean_token_accuracy": 0.6949518918991089, + "num_tokens": 40872408.0, + "step": 1604 + }, + { + "epoch": 0.17625741269492642, + "grad_norm": 2.4060380458831787, + "learning_rate": 8.803512623490668e-07, + "loss": 0.9317, + "mean_token_accuracy": 0.717172384262085, + "num_tokens": 40895346.0, + "step": 1605 + }, + { + "epoch": 0.17636723039754007, + 
"grad_norm": 2.208195686340332, + "learning_rate": 8.80900109769484e-07, + "loss": 1.0109, + "mean_token_accuracy": 0.7070942521095276, + "num_tokens": 40922259.0, + "step": 1606 + }, + { + "epoch": 0.17647704810015374, + "grad_norm": 2.6329872608184814, + "learning_rate": 8.814489571899012e-07, + "loss": 1.0259, + "mean_token_accuracy": 0.6983315944671631, + "num_tokens": 40943434.0, + "step": 1607 + }, + { + "epoch": 0.17658686580276742, + "grad_norm": 2.5907790660858154, + "learning_rate": 8.819978046103182e-07, + "loss": 1.0077, + "mean_token_accuracy": 0.6975520253181458, + "num_tokens": 40965014.0, + "step": 1608 + }, + { + "epoch": 0.17669668350538106, + "grad_norm": 2.3912620544433594, + "learning_rate": 8.825466520307354e-07, + "loss": 1.0946, + "mean_token_accuracy": 0.6833634376525879, + "num_tokens": 40990410.0, + "step": 1609 + }, + { + "epoch": 0.17680650120799474, + "grad_norm": 2.7758517265319824, + "learning_rate": 8.830954994511526e-07, + "loss": 0.9851, + "mean_token_accuracy": 0.7045520544052124, + "num_tokens": 41008643.0, + "step": 1610 + }, + { + "epoch": 0.17691631891060838, + "grad_norm": 2.136894702911377, + "learning_rate": 8.836443468715696e-07, + "loss": 1.0552, + "mean_token_accuracy": 0.6826162934303284, + "num_tokens": 41037151.0, + "step": 1611 + }, + { + "epoch": 0.17702613661322206, + "grad_norm": 2.0530786514282227, + "learning_rate": 8.841931942919868e-07, + "loss": 0.9745, + "mean_token_accuracy": 0.7029513716697693, + "num_tokens": 41065485.0, + "step": 1612 + }, + { + "epoch": 0.1771359543158357, + "grad_norm": 2.5082931518554688, + "learning_rate": 8.84742041712404e-07, + "loss": 0.9355, + "mean_token_accuracy": 0.7199004292488098, + "num_tokens": 41087404.0, + "step": 1613 + }, + { + "epoch": 0.17724577201844938, + "grad_norm": 2.357074737548828, + "learning_rate": 8.85290889132821e-07, + "loss": 1.0567, + "mean_token_accuracy": 0.6838794946670532, + "num_tokens": 41113521.0, + "step": 1614 + }, + { + "epoch": 
0.17735558972106302, + "grad_norm": 2.429607629776001, + "learning_rate": 8.858397365532382e-07, + "loss": 1.0136, + "mean_token_accuracy": 0.6966495513916016, + "num_tokens": 41135813.0, + "step": 1615 + }, + { + "epoch": 0.1774654074236767, + "grad_norm": 2.432466983795166, + "learning_rate": 8.863885839736552e-07, + "loss": 1.074, + "mean_token_accuracy": 0.6819047927856445, + "num_tokens": 41159471.0, + "step": 1616 + }, + { + "epoch": 0.17757522512629037, + "grad_norm": 2.316542387008667, + "learning_rate": 8.869374313940724e-07, + "loss": 0.9288, + "mean_token_accuracy": 0.7185032963752747, + "num_tokens": 41181636.0, + "step": 1617 + }, + { + "epoch": 0.17768504282890402, + "grad_norm": 2.8948636054992676, + "learning_rate": 8.874862788144896e-07, + "loss": 0.8992, + "mean_token_accuracy": 0.7221015691757202, + "num_tokens": 41198661.0, + "step": 1618 + }, + { + "epoch": 0.1777948605315177, + "grad_norm": 2.3873918056488037, + "learning_rate": 8.880351262349066e-07, + "loss": 1.0547, + "mean_token_accuracy": 0.6907184720039368, + "num_tokens": 41222708.0, + "step": 1619 + }, + { + "epoch": 0.17790467823413134, + "grad_norm": 2.423138380050659, + "learning_rate": 8.885839736553238e-07, + "loss": 1.0238, + "mean_token_accuracy": 0.6996645927429199, + "num_tokens": 41245900.0, + "step": 1620 + }, + { + "epoch": 0.178014495936745, + "grad_norm": 2.2259151935577393, + "learning_rate": 8.89132821075741e-07, + "loss": 1.0617, + "mean_token_accuracy": 0.6819493770599365, + "num_tokens": 41274121.0, + "step": 1621 + }, + { + "epoch": 0.17812431363935866, + "grad_norm": 2.3535637855529785, + "learning_rate": 8.89681668496158e-07, + "loss": 1.0857, + "mean_token_accuracy": 0.6749168634414673, + "num_tokens": 41298908.0, + "step": 1622 + }, + { + "epoch": 0.17823413134197233, + "grad_norm": 2.5632214546203613, + "learning_rate": 8.902305159165752e-07, + "loss": 0.9894, + "mean_token_accuracy": 0.6987619400024414, + "num_tokens": 41318633.0, + "step": 1623 + }, + { + 
"epoch": 0.17834394904458598, + "grad_norm": 2.2013025283813477, + "learning_rate": 8.907793633369924e-07, + "loss": 1.0698, + "mean_token_accuracy": 0.6874533295631409, + "num_tokens": 41346878.0, + "step": 1624 + }, + { + "epoch": 0.17845376674719965, + "grad_norm": 2.1651597023010254, + "learning_rate": 8.913282107574093e-07, + "loss": 1.0217, + "mean_token_accuracy": 0.6990036964416504, + "num_tokens": 41374994.0, + "step": 1625 + }, + { + "epoch": 0.17856358444981332, + "grad_norm": 2.812080144882202, + "learning_rate": 8.918770581778265e-07, + "loss": 0.9976, + "mean_token_accuracy": 0.6980483531951904, + "num_tokens": 41394315.0, + "step": 1626 + }, + { + "epoch": 0.17867340215242697, + "grad_norm": 2.4832100868225098, + "learning_rate": 8.924259055982436e-07, + "loss": 1.0825, + "mean_token_accuracy": 0.6822511553764343, + "num_tokens": 41417446.0, + "step": 1627 + }, + { + "epoch": 0.17878321985504064, + "grad_norm": 2.663322925567627, + "learning_rate": 8.929747530186607e-07, + "loss": 0.9554, + "mean_token_accuracy": 0.7098495960235596, + "num_tokens": 41436423.0, + "step": 1628 + }, + { + "epoch": 0.1788930375576543, + "grad_norm": 2.2723097801208496, + "learning_rate": 8.935236004390779e-07, + "loss": 1.068, + "mean_token_accuracy": 0.6839314103126526, + "num_tokens": 41460897.0, + "step": 1629 + }, + { + "epoch": 0.17900285526026796, + "grad_norm": 2.2178800106048584, + "learning_rate": 8.94072447859495e-07, + "loss": 1.0455, + "mean_token_accuracy": 0.6925832033157349, + "num_tokens": 41489891.0, + "step": 1630 + }, + { + "epoch": 0.1791126729628816, + "grad_norm": 2.2935473918914795, + "learning_rate": 8.946212952799121e-07, + "loss": 1.0637, + "mean_token_accuracy": 0.7001935243606567, + "num_tokens": 41515044.0, + "step": 1631 + }, + { + "epoch": 0.17922249066549528, + "grad_norm": 2.595789909362793, + "learning_rate": 8.951701427003293e-07, + "loss": 1.0152, + "mean_token_accuracy": 0.6916059851646423, + "num_tokens": 41535245.0, + "step": 1632 + 
}, + { + "epoch": 0.17933230836810893, + "grad_norm": 2.099895477294922, + "learning_rate": 8.957189901207464e-07, + "loss": 1.0006, + "mean_token_accuracy": 0.7015262246131897, + "num_tokens": 41562712.0, + "step": 1633 + }, + { + "epoch": 0.1794421260707226, + "grad_norm": 2.3262274265289307, + "learning_rate": 8.962678375411635e-07, + "loss": 1.0591, + "mean_token_accuracy": 0.6808437705039978, + "num_tokens": 41587862.0, + "step": 1634 + }, + { + "epoch": 0.17955194377333628, + "grad_norm": 2.377802848815918, + "learning_rate": 8.968166849615807e-07, + "loss": 0.9413, + "mean_token_accuracy": 0.719803512096405, + "num_tokens": 41611330.0, + "step": 1635 + }, + { + "epoch": 0.17966176147594992, + "grad_norm": 2.290066719055176, + "learning_rate": 8.973655323819978e-07, + "loss": 1.0027, + "mean_token_accuracy": 0.7023781538009644, + "num_tokens": 41636701.0, + "step": 1636 + }, + { + "epoch": 0.1797715791785636, + "grad_norm": 2.3310186862945557, + "learning_rate": 8.979143798024149e-07, + "loss": 0.9295, + "mean_token_accuracy": 0.7180980443954468, + "num_tokens": 41660358.0, + "step": 1637 + }, + { + "epoch": 0.17988139688117724, + "grad_norm": 2.331022262573242, + "learning_rate": 8.98463227222832e-07, + "loss": 1.0032, + "mean_token_accuracy": 0.7010117769241333, + "num_tokens": 41683738.0, + "step": 1638 + }, + { + "epoch": 0.17999121458379091, + "grad_norm": 2.2705516815185547, + "learning_rate": 8.990120746432492e-07, + "loss": 1.0366, + "mean_token_accuracy": 0.6956945657730103, + "num_tokens": 41709225.0, + "step": 1639 + }, + { + "epoch": 0.18010103228640456, + "grad_norm": 2.362025499343872, + "learning_rate": 8.995609220636663e-07, + "loss": 0.939, + "mean_token_accuracy": 0.7150018215179443, + "num_tokens": 41733532.0, + "step": 1640 + }, + { + "epoch": 0.18021084998901823, + "grad_norm": 2.229497194290161, + "learning_rate": 9.001097694840834e-07, + "loss": 1.0554, + "mean_token_accuracy": 0.6851367950439453, + "num_tokens": 41763557.0, + "step": 
1641 + }, + { + "epoch": 0.18032066769163188, + "grad_norm": 2.649557590484619, + "learning_rate": 9.006586169045006e-07, + "loss": 0.8878, + "mean_token_accuracy": 0.7230219841003418, + "num_tokens": 41783970.0, + "step": 1642 + }, + { + "epoch": 0.18043048539424555, + "grad_norm": 1.9721938371658325, + "learning_rate": 9.012074643249177e-07, + "loss": 0.9691, + "mean_token_accuracy": 0.709722638130188, + "num_tokens": 41817695.0, + "step": 1643 + }, + { + "epoch": 0.1805403030968592, + "grad_norm": 2.148643970489502, + "learning_rate": 9.017563117453347e-07, + "loss": 1.1185, + "mean_token_accuracy": 0.6689969301223755, + "num_tokens": 41848722.0, + "step": 1644 + }, + { + "epoch": 0.18065012079947287, + "grad_norm": 2.2905428409576416, + "learning_rate": 9.023051591657519e-07, + "loss": 1.0575, + "mean_token_accuracy": 0.6794667840003967, + "num_tokens": 41873121.0, + "step": 1645 + }, + { + "epoch": 0.18075993850208655, + "grad_norm": 2.275790214538574, + "learning_rate": 9.02854006586169e-07, + "loss": 1.0365, + "mean_token_accuracy": 0.6984782218933105, + "num_tokens": 41899036.0, + "step": 1646 + }, + { + "epoch": 0.1808697562047002, + "grad_norm": 2.4419116973876953, + "learning_rate": 9.034028540065861e-07, + "loss": 1.0868, + "mean_token_accuracy": 0.6725323796272278, + "num_tokens": 41922947.0, + "step": 1647 + }, + { + "epoch": 0.18097957390731387, + "grad_norm": 2.433535575866699, + "learning_rate": 9.039517014270033e-07, + "loss": 1.1329, + "mean_token_accuracy": 0.6642831563949585, + "num_tokens": 41947701.0, + "step": 1648 + }, + { + "epoch": 0.1810893916099275, + "grad_norm": 2.543323040008545, + "learning_rate": 9.045005488474203e-07, + "loss": 1.0269, + "mean_token_accuracy": 0.6918759346008301, + "num_tokens": 41967407.0, + "step": 1649 + }, + { + "epoch": 0.1811992093125412, + "grad_norm": 2.0631346702575684, + "learning_rate": 9.050493962678375e-07, + "loss": 1.0108, + "mean_token_accuracy": 0.6958125829696655, + "num_tokens": 41997017.0, + 
"step": 1650 + }, + { + "epoch": 0.18130902701515483, + "grad_norm": 2.533071517944336, + "learning_rate": 9.055982436882547e-07, + "loss": 1.0017, + "mean_token_accuracy": 0.6941630840301514, + "num_tokens": 42018466.0, + "step": 1651 + }, + { + "epoch": 0.1814188447177685, + "grad_norm": 2.176034450531006, + "learning_rate": 9.061470911086717e-07, + "loss": 1.0719, + "mean_token_accuracy": 0.6800273656845093, + "num_tokens": 42045517.0, + "step": 1652 + }, + { + "epoch": 0.18152866242038215, + "grad_norm": 2.2874789237976074, + "learning_rate": 9.066959385290889e-07, + "loss": 1.0829, + "mean_token_accuracy": 0.6803515553474426, + "num_tokens": 42072452.0, + "step": 1653 + }, + { + "epoch": 0.18163848012299583, + "grad_norm": 2.127419948577881, + "learning_rate": 9.07244785949506e-07, + "loss": 1.0702, + "mean_token_accuracy": 0.6833619475364685, + "num_tokens": 42101694.0, + "step": 1654 + }, + { + "epoch": 0.1817482978256095, + "grad_norm": 2.2437727451324463, + "learning_rate": 9.077936333699231e-07, + "loss": 0.9879, + "mean_token_accuracy": 0.6991247534751892, + "num_tokens": 42128810.0, + "step": 1655 + }, + { + "epoch": 0.18185811552822315, + "grad_norm": 2.226414680480957, + "learning_rate": 9.083424807903403e-07, + "loss": 0.9383, + "mean_token_accuracy": 0.7177004814147949, + "num_tokens": 42154731.0, + "step": 1656 + }, + { + "epoch": 0.18196793323083682, + "grad_norm": 2.4945013523101807, + "learning_rate": 9.088913282107573e-07, + "loss": 1.0899, + "mean_token_accuracy": 0.6743139624595642, + "num_tokens": 42180665.0, + "step": 1657 + }, + { + "epoch": 0.18207775093345047, + "grad_norm": 2.3913519382476807, + "learning_rate": 9.094401756311745e-07, + "loss": 1.0202, + "mean_token_accuracy": 0.6910874843597412, + "num_tokens": 42203435.0, + "step": 1658 + }, + { + "epoch": 0.18218756863606414, + "grad_norm": 2.5207676887512207, + "learning_rate": 9.099890230515917e-07, + "loss": 1.0873, + "mean_token_accuracy": 0.6725069880485535, + "num_tokens": 
42225948.0, + "step": 1659 + }, + { + "epoch": 0.18229738633867779, + "grad_norm": 2.4594926834106445, + "learning_rate": 9.105378704720087e-07, + "loss": 0.9962, + "mean_token_accuracy": 0.7073666453361511, + "num_tokens": 42248189.0, + "step": 1660 + }, + { + "epoch": 0.18240720404129146, + "grad_norm": 2.3422539234161377, + "learning_rate": 9.110867178924259e-07, + "loss": 1.0868, + "mean_token_accuracy": 0.6831672787666321, + "num_tokens": 42271326.0, + "step": 1661 + }, + { + "epoch": 0.1825170217439051, + "grad_norm": 2.015625, + "learning_rate": 9.11635565312843e-07, + "loss": 1.0044, + "mean_token_accuracy": 0.6948727965354919, + "num_tokens": 42300512.0, + "step": 1662 + }, + { + "epoch": 0.18262683944651878, + "grad_norm": 2.709785223007202, + "learning_rate": 9.1218441273326e-07, + "loss": 0.9854, + "mean_token_accuracy": 0.7049053907394409, + "num_tokens": 42319748.0, + "step": 1663 + }, + { + "epoch": 0.18273665714913245, + "grad_norm": 2.3221890926361084, + "learning_rate": 9.127332601536772e-07, + "loss": 0.9994, + "mean_token_accuracy": 0.701573371887207, + "num_tokens": 42343919.0, + "step": 1664 + }, + { + "epoch": 0.1828464748517461, + "grad_norm": 2.389195442199707, + "learning_rate": 9.132821075740944e-07, + "loss": 1.055, + "mean_token_accuracy": 0.6947392225265503, + "num_tokens": 42368563.0, + "step": 1665 + }, + { + "epoch": 0.18295629255435977, + "grad_norm": 2.4554967880249023, + "learning_rate": 9.138309549945114e-07, + "loss": 1.0449, + "mean_token_accuracy": 0.6892123222351074, + "num_tokens": 42390671.0, + "step": 1666 + }, + { + "epoch": 0.18306611025697342, + "grad_norm": 2.247244119644165, + "learning_rate": 9.143798024149286e-07, + "loss": 0.9622, + "mean_token_accuracy": 0.7084134817123413, + "num_tokens": 42416568.0, + "step": 1667 + }, + { + "epoch": 0.1831759279595871, + "grad_norm": 2.2131834030151367, + "learning_rate": 9.149286498353457e-07, + "loss": 1.0344, + "mean_token_accuracy": 0.7042924165725708, + "num_tokens": 
42444382.0, + "step": 1668 + }, + { + "epoch": 0.18328574566220074, + "grad_norm": 2.4389026165008545, + "learning_rate": 9.154774972557628e-07, + "loss": 1.0211, + "mean_token_accuracy": 0.6907708644866943, + "num_tokens": 42468516.0, + "step": 1669 + }, + { + "epoch": 0.1833955633648144, + "grad_norm": 2.3490965366363525, + "learning_rate": 9.1602634467618e-07, + "loss": 1.0545, + "mean_token_accuracy": 0.6886712312698364, + "num_tokens": 42493775.0, + "step": 1670 + }, + { + "epoch": 0.18350538106742806, + "grad_norm": 2.4222946166992188, + "learning_rate": 9.165751920965971e-07, + "loss": 0.9519, + "mean_token_accuracy": 0.7084990739822388, + "num_tokens": 42516582.0, + "step": 1671 + }, + { + "epoch": 0.18361519877004173, + "grad_norm": 2.4085745811462402, + "learning_rate": 9.171240395170142e-07, + "loss": 0.9921, + "mean_token_accuracy": 0.7077127695083618, + "num_tokens": 42539910.0, + "step": 1672 + }, + { + "epoch": 0.1837250164726554, + "grad_norm": 2.1797707080841064, + "learning_rate": 9.176728869374314e-07, + "loss": 1.0798, + "mean_token_accuracy": 0.6765793561935425, + "num_tokens": 42568459.0, + "step": 1673 + }, + { + "epoch": 0.18383483417526905, + "grad_norm": 2.5866053104400635, + "learning_rate": 9.182217343578485e-07, + "loss": 1.0618, + "mean_token_accuracy": 0.6921021938323975, + "num_tokens": 42589655.0, + "step": 1674 + }, + { + "epoch": 0.18394465187788273, + "grad_norm": 2.2141923904418945, + "learning_rate": 9.187705817782656e-07, + "loss": 1.0115, + "mean_token_accuracy": 0.7037272453308105, + "num_tokens": 42615034.0, + "step": 1675 + }, + { + "epoch": 0.18405446958049637, + "grad_norm": 2.3137195110321045, + "learning_rate": 9.193194291986828e-07, + "loss": 1.0773, + "mean_token_accuracy": 0.6836887001991272, + "num_tokens": 42639332.0, + "step": 1676 + }, + { + "epoch": 0.18416428728311005, + "grad_norm": 2.599632978439331, + "learning_rate": 9.198682766190999e-07, + "loss": 1.0152, + "mean_token_accuracy": 0.6994372606277466, + 
"num_tokens": 42660686.0, + "step": 1677 + }, + { + "epoch": 0.1842741049857237, + "grad_norm": 2.1367318630218506, + "learning_rate": 9.20417124039517e-07, + "loss": 1.1301, + "mean_token_accuracy": 0.6677296161651611, + "num_tokens": 42689708.0, + "step": 1678 + }, + { + "epoch": 0.18438392268833736, + "grad_norm": 2.2861692905426025, + "learning_rate": 9.209659714599341e-07, + "loss": 1.0785, + "mean_token_accuracy": 0.6809384822845459, + "num_tokens": 42715390.0, + "step": 1679 + }, + { + "epoch": 0.184493740390951, + "grad_norm": 2.054123640060425, + "learning_rate": 9.215148188803513e-07, + "loss": 1.081, + "mean_token_accuracy": 0.676995038986206, + "num_tokens": 42747179.0, + "step": 1680 + }, + { + "epoch": 0.18460355809356468, + "grad_norm": 1.8735171556472778, + "learning_rate": 9.220636663007683e-07, + "loss": 1.0077, + "mean_token_accuracy": 0.7010002732276917, + "num_tokens": 42782393.0, + "step": 1681 + }, + { + "epoch": 0.18471337579617833, + "grad_norm": 2.4830193519592285, + "learning_rate": 9.226125137211854e-07, + "loss": 1.0755, + "mean_token_accuracy": 0.6811065077781677, + "num_tokens": 42806081.0, + "step": 1682 + }, + { + "epoch": 0.184823193498792, + "grad_norm": 2.126339912414551, + "learning_rate": 9.231613611416026e-07, + "loss": 1.0523, + "mean_token_accuracy": 0.6896158456802368, + "num_tokens": 42834463.0, + "step": 1683 + }, + { + "epoch": 0.18493301120140568, + "grad_norm": 2.5101401805877686, + "learning_rate": 9.237102085620197e-07, + "loss": 0.978, + "mean_token_accuracy": 0.698506236076355, + "num_tokens": 42857652.0, + "step": 1684 + }, + { + "epoch": 0.18504282890401932, + "grad_norm": 2.2740209102630615, + "learning_rate": 9.242590559824368e-07, + "loss": 0.9412, + "mean_token_accuracy": 0.7163438200950623, + "num_tokens": 42882777.0, + "step": 1685 + }, + { + "epoch": 0.185152646606633, + "grad_norm": 2.475554943084717, + "learning_rate": 9.24807903402854e-07, + "loss": 1.0088, + "mean_token_accuracy": 0.7055407762527466, + 
"num_tokens": 42905230.0, + "step": 1686 + }, + { + "epoch": 0.18526246430924664, + "grad_norm": 2.115831136703491, + "learning_rate": 9.253567508232711e-07, + "loss": 1.0499, + "mean_token_accuracy": 0.689029335975647, + "num_tokens": 42934458.0, + "step": 1687 + }, + { + "epoch": 0.18537228201186032, + "grad_norm": 2.3698885440826416, + "learning_rate": 9.259055982436882e-07, + "loss": 1.0423, + "mean_token_accuracy": 0.6889275908470154, + "num_tokens": 42958575.0, + "step": 1688 + }, + { + "epoch": 0.18548209971447396, + "grad_norm": 2.3562047481536865, + "learning_rate": 9.264544456641053e-07, + "loss": 0.8794, + "mean_token_accuracy": 0.7325390577316284, + "num_tokens": 42980974.0, + "step": 1689 + }, + { + "epoch": 0.18559191741708764, + "grad_norm": 2.596724033355713, + "learning_rate": 9.270032930845224e-07, + "loss": 1.0009, + "mean_token_accuracy": 0.6985872983932495, + "num_tokens": 43001606.0, + "step": 1690 + }, + { + "epoch": 0.18570173511970128, + "grad_norm": 2.3397233486175537, + "learning_rate": 9.275521405049396e-07, + "loss": 1.0247, + "mean_token_accuracy": 0.6975481510162354, + "num_tokens": 43025670.0, + "step": 1691 + }, + { + "epoch": 0.18581155282231496, + "grad_norm": 3.3710343837738037, + "learning_rate": 9.281009879253567e-07, + "loss": 1.0005, + "mean_token_accuracy": 0.6964130401611328, + "num_tokens": 43047839.0, + "step": 1692 + }, + { + "epoch": 0.18592137052492863, + "grad_norm": 2.0687241554260254, + "learning_rate": 9.286498353457738e-07, + "loss": 0.9536, + "mean_token_accuracy": 0.7182705402374268, + "num_tokens": 43077539.0, + "step": 1693 + }, + { + "epoch": 0.18603118822754228, + "grad_norm": 2.0885140895843506, + "learning_rate": 9.29198682766191e-07, + "loss": 1.0196, + "mean_token_accuracy": 0.6932157278060913, + "num_tokens": 43105945.0, + "step": 1694 + }, + { + "epoch": 0.18614100593015595, + "grad_norm": 2.1944522857666016, + "learning_rate": 9.297475301866081e-07, + "loss": 1.0459, + "mean_token_accuracy": 
0.6854881048202515, + "num_tokens": 43133669.0, + "step": 1695 + }, + { + "epoch": 0.1862508236327696, + "grad_norm": 2.531160593032837, + "learning_rate": 9.302963776070252e-07, + "loss": 1.0409, + "mean_token_accuracy": 0.6880088448524475, + "num_tokens": 43155746.0, + "step": 1696 + }, + { + "epoch": 0.18636064133538327, + "grad_norm": 2.44386887550354, + "learning_rate": 9.308452250274424e-07, + "loss": 1.0444, + "mean_token_accuracy": 0.6918287873268127, + "num_tokens": 43177894.0, + "step": 1697 + }, + { + "epoch": 0.18647045903799692, + "grad_norm": 2.3530287742614746, + "learning_rate": 9.313940724478595e-07, + "loss": 1.0799, + "mean_token_accuracy": 0.6759760975837708, + "num_tokens": 43201575.0, + "step": 1698 + }, + { + "epoch": 0.1865802767406106, + "grad_norm": 2.5180954933166504, + "learning_rate": 9.319429198682766e-07, + "loss": 0.984, + "mean_token_accuracy": 0.7014996409416199, + "num_tokens": 43222283.0, + "step": 1699 + }, + { + "epoch": 0.18669009444322424, + "grad_norm": 2.091529130935669, + "learning_rate": 9.324917672886937e-07, + "loss": 1.0756, + "mean_token_accuracy": 0.675119161605835, + "num_tokens": 43251735.0, + "step": 1700 + }, + { + "epoch": 0.1867999121458379, + "grad_norm": 2.2766733169555664, + "learning_rate": 9.330406147091107e-07, + "loss": 1.0431, + "mean_token_accuracy": 0.6821393966674805, + "num_tokens": 43278061.0, + "step": 1701 + }, + { + "epoch": 0.18690972984845158, + "grad_norm": 2.334796667098999, + "learning_rate": 9.335894621295279e-07, + "loss": 1.0488, + "mean_token_accuracy": 0.6981422901153564, + "num_tokens": 43302828.0, + "step": 1702 + }, + { + "epoch": 0.18701954755106523, + "grad_norm": 2.2200138568878174, + "learning_rate": 9.341383095499451e-07, + "loss": 1.0209, + "mean_token_accuracy": 0.6964321136474609, + "num_tokens": 43329802.0, + "step": 1703 + }, + { + "epoch": 0.1871293652536789, + "grad_norm": 2.4796502590179443, + "learning_rate": 9.346871569703621e-07, + "loss": 0.9773, + 
"mean_token_accuracy": 0.7014026045799255, + "num_tokens": 43351764.0, + "step": 1704 + }, + { + "epoch": 0.18723918295629255, + "grad_norm": 2.3380682468414307, + "learning_rate": 9.352360043907793e-07, + "loss": 1.0895, + "mean_token_accuracy": 0.6802718043327332, + "num_tokens": 43375303.0, + "step": 1705 + }, + { + "epoch": 0.18734900065890622, + "grad_norm": 2.38206148147583, + "learning_rate": 9.357848518111965e-07, + "loss": 0.9386, + "mean_token_accuracy": 0.7103179693222046, + "num_tokens": 43400671.0, + "step": 1706 + }, + { + "epoch": 0.18745881836151987, + "grad_norm": 2.173250436782837, + "learning_rate": 9.363336992316135e-07, + "loss": 1.023, + "mean_token_accuracy": 0.6898442506790161, + "num_tokens": 43430839.0, + "step": 1707 + }, + { + "epoch": 0.18756863606413354, + "grad_norm": 2.044947624206543, + "learning_rate": 9.368825466520307e-07, + "loss": 0.9678, + "mean_token_accuracy": 0.710448145866394, + "num_tokens": 43461390.0, + "step": 1708 + }, + { + "epoch": 0.1876784537667472, + "grad_norm": 2.2735307216644287, + "learning_rate": 9.374313940724479e-07, + "loss": 1.0726, + "mean_token_accuracy": 0.6814028024673462, + "num_tokens": 43487563.0, + "step": 1709 + }, + { + "epoch": 0.18778827146936086, + "grad_norm": 2.9141736030578613, + "learning_rate": 9.379802414928649e-07, + "loss": 0.98, + "mean_token_accuracy": 0.7022299766540527, + "num_tokens": 43506128.0, + "step": 1710 + }, + { + "epoch": 0.18789808917197454, + "grad_norm": 2.272614002227783, + "learning_rate": 9.385290889132821e-07, + "loss": 1.1105, + "mean_token_accuracy": 0.667958676815033, + "num_tokens": 43536250.0, + "step": 1711 + }, + { + "epoch": 0.18800790687458818, + "grad_norm": 2.3867218494415283, + "learning_rate": 9.390779363336992e-07, + "loss": 1.0558, + "mean_token_accuracy": 0.6855216026306152, + "num_tokens": 43562303.0, + "step": 1712 + }, + { + "epoch": 0.18811772457720186, + "grad_norm": 2.281416654586792, + "learning_rate": 9.396267837541163e-07, + "loss": 
1.0438, + "mean_token_accuracy": 0.6881549954414368, + "num_tokens": 43588251.0, + "step": 1713 + }, + { + "epoch": 0.1882275422798155, + "grad_norm": 2.0473217964172363, + "learning_rate": 9.401756311745335e-07, + "loss": 1.0471, + "mean_token_accuracy": 0.6946394443511963, + "num_tokens": 43622054.0, + "step": 1714 + }, + { + "epoch": 0.18833735998242918, + "grad_norm": 2.22812819480896, + "learning_rate": 9.407244785949506e-07, + "loss": 1.0201, + "mean_token_accuracy": 0.6959840059280396, + "num_tokens": 43648281.0, + "step": 1715 + }, + { + "epoch": 0.18844717768504282, + "grad_norm": 2.150712728500366, + "learning_rate": 9.412733260153677e-07, + "loss": 1.0723, + "mean_token_accuracy": 0.6819508671760559, + "num_tokens": 43676529.0, + "step": 1716 + }, + { + "epoch": 0.1885569953876565, + "grad_norm": 2.2953028678894043, + "learning_rate": 9.418221734357849e-07, + "loss": 0.9404, + "mean_token_accuracy": 0.7184309959411621, + "num_tokens": 43700664.0, + "step": 1717 + }, + { + "epoch": 0.18866681309027014, + "grad_norm": 2.142855405807495, + "learning_rate": 9.42371020856202e-07, + "loss": 1.0455, + "mean_token_accuracy": 0.6965863108634949, + "num_tokens": 43727339.0, + "step": 1718 + }, + { + "epoch": 0.18877663079288381, + "grad_norm": 2.5983974933624268, + "learning_rate": 9.42919868276619e-07, + "loss": 1.029, + "mean_token_accuracy": 0.685032069683075, + "num_tokens": 43746578.0, + "step": 1719 + }, + { + "epoch": 0.18888644849549746, + "grad_norm": 2.035517930984497, + "learning_rate": 9.434687156970362e-07, + "loss": 0.9721, + "mean_token_accuracy": 0.6935447454452515, + "num_tokens": 43775161.0, + "step": 1720 + }, + { + "epoch": 0.18899626619811113, + "grad_norm": 2.128868818283081, + "learning_rate": 9.440175631174532e-07, + "loss": 1.0015, + "mean_token_accuracy": 0.6996764540672302, + "num_tokens": 43802832.0, + "step": 1721 + }, + { + "epoch": 0.1891060839007248, + "grad_norm": 2.246610403060913, + "learning_rate": 9.445664105378704e-07, + 
"loss": 1.0566, + "mean_token_accuracy": 0.6806198358535767, + "num_tokens": 43830488.0, + "step": 1722 + }, + { + "epoch": 0.18921590160333845, + "grad_norm": 2.067171573638916, + "learning_rate": 9.451152579582875e-07, + "loss": 1.0326, + "mean_token_accuracy": 0.6947364807128906, + "num_tokens": 43860044.0, + "step": 1723 + }, + { + "epoch": 0.18932571930595213, + "grad_norm": 2.1904211044311523, + "learning_rate": 9.456641053787046e-07, + "loss": 0.931, + "mean_token_accuracy": 0.7069010734558105, + "num_tokens": 43885954.0, + "step": 1724 + }, + { + "epoch": 0.18943553700856577, + "grad_norm": 2.3664071559906006, + "learning_rate": 9.462129527991218e-07, + "loss": 1.0513, + "mean_token_accuracy": 0.6877557635307312, + "num_tokens": 43909702.0, + "step": 1725 + }, + { + "epoch": 0.18954535471117945, + "grad_norm": 2.163872480392456, + "learning_rate": 9.467618002195389e-07, + "loss": 1.0881, + "mean_token_accuracy": 0.6879744529724121, + "num_tokens": 43939406.0, + "step": 1726 + }, + { + "epoch": 0.1896551724137931, + "grad_norm": 2.314577341079712, + "learning_rate": 9.47310647639956e-07, + "loss": 0.94, + "mean_token_accuracy": 0.7139276266098022, + "num_tokens": 43965370.0, + "step": 1727 + }, + { + "epoch": 0.18976499011640677, + "grad_norm": 2.236729621887207, + "learning_rate": 9.478594950603732e-07, + "loss": 1.0526, + "mean_token_accuracy": 0.6827130913734436, + "num_tokens": 43991788.0, + "step": 1728 + }, + { + "epoch": 0.1898748078190204, + "grad_norm": 2.454923629760742, + "learning_rate": 9.484083424807903e-07, + "loss": 0.9847, + "mean_token_accuracy": 0.7010089159011841, + "num_tokens": 44014001.0, + "step": 1729 + }, + { + "epoch": 0.1899846255216341, + "grad_norm": 2.0369186401367188, + "learning_rate": 9.489571899012074e-07, + "loss": 1.052, + "mean_token_accuracy": 0.6886621713638306, + "num_tokens": 44045231.0, + "step": 1730 + }, + { + "epoch": 0.19009444322424776, + "grad_norm": 1.9862451553344727, + "learning_rate": 
9.495060373216246e-07, + "loss": 0.9552, + "mean_token_accuracy": 0.7084080576896667, + "num_tokens": 44075662.0, + "step": 1731 + }, + { + "epoch": 0.1902042609268614, + "grad_norm": 2.2896335124969482, + "learning_rate": 9.500548847420417e-07, + "loss": 0.9941, + "mean_token_accuracy": 0.6987180709838867, + "num_tokens": 44100359.0, + "step": 1732 + }, + { + "epoch": 0.19031407862947508, + "grad_norm": 2.431103467941284, + "learning_rate": 9.506037321624588e-07, + "loss": 0.9488, + "mean_token_accuracy": 0.707209587097168, + "num_tokens": 44122559.0, + "step": 1733 + }, + { + "epoch": 0.19042389633208873, + "grad_norm": 2.4638566970825195, + "learning_rate": 9.511525795828759e-07, + "loss": 1.0371, + "mean_token_accuracy": 0.6906691789627075, + "num_tokens": 44145689.0, + "step": 1734 + }, + { + "epoch": 0.1905337140347024, + "grad_norm": 2.0381581783294678, + "learning_rate": 9.517014270032931e-07, + "loss": 1.1039, + "mean_token_accuracy": 0.679740309715271, + "num_tokens": 44177516.0, + "step": 1735 + }, + { + "epoch": 0.19064353173731605, + "grad_norm": 2.520496368408203, + "learning_rate": 9.522502744237102e-07, + "loss": 0.9518, + "mean_token_accuracy": 0.7118402123451233, + "num_tokens": 44199240.0, + "step": 1736 + }, + { + "epoch": 0.19075334943992972, + "grad_norm": 2.0084145069122314, + "learning_rate": 9.527991218441273e-07, + "loss": 1.068, + "mean_token_accuracy": 0.6807703375816345, + "num_tokens": 44233240.0, + "step": 1737 + }, + { + "epoch": 0.19086316714254337, + "grad_norm": 2.0458412170410156, + "learning_rate": 9.533479692645444e-07, + "loss": 1.0892, + "mean_token_accuracy": 0.6740036606788635, + "num_tokens": 44265828.0, + "step": 1738 + }, + { + "epoch": 0.19097298484515704, + "grad_norm": 2.407451868057251, + "learning_rate": 9.538968166849616e-07, + "loss": 0.9318, + "mean_token_accuracy": 0.714912474155426, + "num_tokens": 44289172.0, + "step": 1739 + }, + { + "epoch": 0.1910828025477707, + "grad_norm": 2.5644235610961914, + 
"learning_rate": 9.544456641053787e-07, + "loss": 0.9592, + "mean_token_accuracy": 0.709470272064209, + "num_tokens": 44308555.0, + "step": 1740 + }, + { + "epoch": 0.19119262025038436, + "grad_norm": 2.238461971282959, + "learning_rate": 9.549945115257958e-07, + "loss": 1.0897, + "mean_token_accuracy": 0.6755696535110474, + "num_tokens": 44337166.0, + "step": 1741 + }, + { + "epoch": 0.19130243795299803, + "grad_norm": 2.2709150314331055, + "learning_rate": 9.55543358946213e-07, + "loss": 0.9734, + "mean_token_accuracy": 0.7020735740661621, + "num_tokens": 44362293.0, + "step": 1742 + }, + { + "epoch": 0.19141225565561168, + "grad_norm": 2.0032641887664795, + "learning_rate": 9.5609220636663e-07, + "loss": 1.037, + "mean_token_accuracy": 0.6830050945281982, + "num_tokens": 44394328.0, + "step": 1743 + }, + { + "epoch": 0.19152207335822535, + "grad_norm": 2.290710926055908, + "learning_rate": 9.566410537870472e-07, + "loss": 1.0686, + "mean_token_accuracy": 0.6776418685913086, + "num_tokens": 44419121.0, + "step": 1744 + }, + { + "epoch": 0.191631891060839, + "grad_norm": 2.1207098960876465, + "learning_rate": 9.571899012074642e-07, + "loss": 1.0828, + "mean_token_accuracy": 0.6760983467102051, + "num_tokens": 44449315.0, + "step": 1745 + }, + { + "epoch": 0.19174170876345267, + "grad_norm": 2.306079387664795, + "learning_rate": 9.577387486278815e-07, + "loss": 1.0119, + "mean_token_accuracy": 0.6940192580223083, + "num_tokens": 44473579.0, + "step": 1746 + }, + { + "epoch": 0.19185152646606632, + "grad_norm": 2.3530852794647217, + "learning_rate": 9.582875960482986e-07, + "loss": 0.9118, + "mean_token_accuracy": 0.7176038026809692, + "num_tokens": 44496189.0, + "step": 1747 + }, + { + "epoch": 0.19196134416868, + "grad_norm": 2.189863681793213, + "learning_rate": 9.588364434687156e-07, + "loss": 1.0951, + "mean_token_accuracy": 0.6824553608894348, + "num_tokens": 44523893.0, + "step": 1748 + }, + { + "epoch": 0.19207116187129367, + "grad_norm": 2.5307092666625977, 
+ "learning_rate": 9.593852908891327e-07, + "loss": 0.8978, + "mean_token_accuracy": 0.7205397486686707, + "num_tokens": 44544499.0, + "step": 1749 + }, + { + "epoch": 0.1921809795739073, + "grad_norm": 2.2298407554626465, + "learning_rate": 9.5993413830955e-07, + "loss": 0.988, + "mean_token_accuracy": 0.6997556090354919, + "num_tokens": 44568407.0, + "step": 1750 + }, + { + "epoch": 0.19229079727652099, + "grad_norm": 2.476222515106201, + "learning_rate": 9.60482985729967e-07, + "loss": 1.0579, + "mean_token_accuracy": 0.684773325920105, + "num_tokens": 44590871.0, + "step": 1751 + }, + { + "epoch": 0.19240061497913463, + "grad_norm": 2.2640416622161865, + "learning_rate": 9.61031833150384e-07, + "loss": 1.0087, + "mean_token_accuracy": 0.6946266293525696, + "num_tokens": 44616908.0, + "step": 1752 + }, + { + "epoch": 0.1925104326817483, + "grad_norm": 2.5508337020874023, + "learning_rate": 9.615806805708014e-07, + "loss": 1.0283, + "mean_token_accuracy": 0.6951305866241455, + "num_tokens": 44639702.0, + "step": 1753 + }, + { + "epoch": 0.19262025038436195, + "grad_norm": 2.063964605331421, + "learning_rate": 9.621295279912184e-07, + "loss": 1.084, + "mean_token_accuracy": 0.6778870224952698, + "num_tokens": 44668740.0, + "step": 1754 + }, + { + "epoch": 0.19273006808697563, + "grad_norm": 2.3249635696411133, + "learning_rate": 9.626783754116355e-07, + "loss": 0.9981, + "mean_token_accuracy": 0.7033670544624329, + "num_tokens": 44698638.0, + "step": 1755 + }, + { + "epoch": 0.19283988578958927, + "grad_norm": 2.648421049118042, + "learning_rate": 9.632272228320525e-07, + "loss": 0.9809, + "mean_token_accuracy": 0.702918291091919, + "num_tokens": 44717521.0, + "step": 1756 + }, + { + "epoch": 0.19294970349220295, + "grad_norm": 2.087240695953369, + "learning_rate": 9.637760702524698e-07, + "loss": 1.0627, + "mean_token_accuracy": 0.6830545663833618, + "num_tokens": 44746608.0, + "step": 1757 + }, + { + "epoch": 0.1930595211948166, + "grad_norm": 2.396552085876465, 
+ "learning_rate": 9.643249176728869e-07, + "loss": 1.0344, + "mean_token_accuracy": 0.689480185508728, + "num_tokens": 44772017.0, + "step": 1758 + }, + { + "epoch": 0.19316933889743026, + "grad_norm": 2.2340166568756104, + "learning_rate": 9.64873765093304e-07, + "loss": 1.0427, + "mean_token_accuracy": 0.686553955078125, + "num_tokens": 44799031.0, + "step": 1759 + }, + { + "epoch": 0.19327915660004394, + "grad_norm": 2.2024247646331787, + "learning_rate": 9.654226125137212e-07, + "loss": 1.0408, + "mean_token_accuracy": 0.6888781189918518, + "num_tokens": 44826467.0, + "step": 1760 + }, + { + "epoch": 0.19338897430265758, + "grad_norm": 2.2515695095062256, + "learning_rate": 9.659714599341383e-07, + "loss": 1.1141, + "mean_token_accuracy": 0.6794174909591675, + "num_tokens": 44855624.0, + "step": 1761 + }, + { + "epoch": 0.19349879200527126, + "grad_norm": 2.006181240081787, + "learning_rate": 9.665203073545553e-07, + "loss": 1.1065, + "mean_token_accuracy": 0.6754329204559326, + "num_tokens": 44887847.0, + "step": 1762 + }, + { + "epoch": 0.1936086097078849, + "grad_norm": 2.4658825397491455, + "learning_rate": 9.670691547749726e-07, + "loss": 1.0612, + "mean_token_accuracy": 0.6772352457046509, + "num_tokens": 44913520.0, + "step": 1763 + }, + { + "epoch": 0.19371842741049858, + "grad_norm": 2.272181749343872, + "learning_rate": 9.676180021953897e-07, + "loss": 1.0224, + "mean_token_accuracy": 0.6922585964202881, + "num_tokens": 44940775.0, + "step": 1764 + }, + { + "epoch": 0.19382824511311222, + "grad_norm": 2.4024665355682373, + "learning_rate": 9.681668496158067e-07, + "loss": 0.9551, + "mean_token_accuracy": 0.7139402627944946, + "num_tokens": 44964747.0, + "step": 1765 + }, + { + "epoch": 0.1939380628157259, + "grad_norm": 2.6585805416107178, + "learning_rate": 9.68715697036224e-07, + "loss": 1.0221, + "mean_token_accuracy": 0.6905151009559631, + "num_tokens": 44984500.0, + "step": 1766 + }, + { + "epoch": 0.19404788051833954, + "grad_norm": 
2.8163154125213623, + "learning_rate": 9.692645444566409e-07, + "loss": 1.0457, + "mean_token_accuracy": 0.6931679248809814, + "num_tokens": 45003416.0, + "step": 1767 + }, + { + "epoch": 0.19415769822095322, + "grad_norm": 2.366877794265747, + "learning_rate": 9.698133918770581e-07, + "loss": 1.029, + "mean_token_accuracy": 0.6931512355804443, + "num_tokens": 45026742.0, + "step": 1768 + }, + { + "epoch": 0.1942675159235669, + "grad_norm": 2.3637149333953857, + "learning_rate": 9.703622392974752e-07, + "loss": 1.0475, + "mean_token_accuracy": 0.6899123787879944, + "num_tokens": 45049776.0, + "step": 1769 + }, + { + "epoch": 0.19437733362618054, + "grad_norm": 2.294342279434204, + "learning_rate": 9.709110867178923e-07, + "loss": 0.967, + "mean_token_accuracy": 0.7066899538040161, + "num_tokens": 45073041.0, + "step": 1770 + }, + { + "epoch": 0.1944871513287942, + "grad_norm": 2.391401767730713, + "learning_rate": 9.714599341383095e-07, + "loss": 1.0868, + "mean_token_accuracy": 0.6788709163665771, + "num_tokens": 45097140.0, + "step": 1771 + }, + { + "epoch": 0.19459696903140786, + "grad_norm": 2.0453712940216064, + "learning_rate": 9.720087815587266e-07, + "loss": 0.9942, + "mean_token_accuracy": 0.7023631930351257, + "num_tokens": 45127246.0, + "step": 1772 + }, + { + "epoch": 0.19470678673402153, + "grad_norm": 2.63208270072937, + "learning_rate": 9.725576289791437e-07, + "loss": 1.0562, + "mean_token_accuracy": 0.6799378395080566, + "num_tokens": 45148253.0, + "step": 1773 + }, + { + "epoch": 0.19481660443663518, + "grad_norm": 2.2493181228637695, + "learning_rate": 9.73106476399561e-07, + "loss": 1.0107, + "mean_token_accuracy": 0.6900644898414612, + "num_tokens": 45175120.0, + "step": 1774 + }, + { + "epoch": 0.19492642213924885, + "grad_norm": 2.330166816711426, + "learning_rate": 9.73655323819978e-07, + "loss": 1.013, + "mean_token_accuracy": 0.692206859588623, + "num_tokens": 45201458.0, + "step": 1775 + }, + { + "epoch": 0.1950362398418625, + 
"grad_norm": 2.492206573486328, + "learning_rate": 9.74204171240395e-07, + "loss": 0.9833, + "mean_token_accuracy": 0.7038654685020447, + "num_tokens": 45223269.0, + "step": 1776 + }, + { + "epoch": 0.19514605754447617, + "grad_norm": 1.976956844329834, + "learning_rate": 9.747530186608123e-07, + "loss": 1.1149, + "mean_token_accuracy": 0.6664572954177856, + "num_tokens": 45259356.0, + "step": 1777 + }, + { + "epoch": 0.19525587524708984, + "grad_norm": 2.1031782627105713, + "learning_rate": 9.753018660812294e-07, + "loss": 1.107, + "mean_token_accuracy": 0.6671348810195923, + "num_tokens": 45288336.0, + "step": 1778 + }, + { + "epoch": 0.1953656929497035, + "grad_norm": 2.299863576889038, + "learning_rate": 9.758507135016465e-07, + "loss": 1.0173, + "mean_token_accuracy": 0.6907821893692017, + "num_tokens": 45312984.0, + "step": 1779 + }, + { + "epoch": 0.19547551065231716, + "grad_norm": 2.265080213546753, + "learning_rate": 9.763995609220637e-07, + "loss": 1.058, + "mean_token_accuracy": 0.6808831691741943, + "num_tokens": 45339021.0, + "step": 1780 + }, + { + "epoch": 0.1955853283549308, + "grad_norm": 2.148421287536621, + "learning_rate": 9.769484083424808e-07, + "loss": 1.0243, + "mean_token_accuracy": 0.6980892419815063, + "num_tokens": 45367913.0, + "step": 1781 + }, + { + "epoch": 0.19569514605754448, + "grad_norm": 2.3390138149261475, + "learning_rate": 9.774972557628979e-07, + "loss": 1.1385, + "mean_token_accuracy": 0.6640272736549377, + "num_tokens": 45392680.0, + "step": 1782 + }, + { + "epoch": 0.19580496376015813, + "grad_norm": 2.1678519248962402, + "learning_rate": 9.780461031833151e-07, + "loss": 1.0161, + "mean_token_accuracy": 0.6944005489349365, + "num_tokens": 45420020.0, + "step": 1783 + }, + { + "epoch": 0.1959147814627718, + "grad_norm": 2.627203941345215, + "learning_rate": 9.785949506037322e-07, + "loss": 0.988, + "mean_token_accuracy": 0.6973575949668884, + "num_tokens": 45440330.0, + "step": 1784 + }, + { + "epoch": 
0.19602459916538545, + "grad_norm": 2.0756964683532715, + "learning_rate": 9.791437980241493e-07, + "loss": 0.9899, + "mean_token_accuracy": 0.7044427394866943, + "num_tokens": 45469153.0, + "step": 1785 + }, + { + "epoch": 0.19613441686799912, + "grad_norm": 2.512251377105713, + "learning_rate": 9.796926454445663e-07, + "loss": 1.0268, + "mean_token_accuracy": 0.6891014575958252, + "num_tokens": 45491253.0, + "step": 1786 + }, + { + "epoch": 0.1962442345706128, + "grad_norm": 2.5235512256622314, + "learning_rate": 9.802414928649834e-07, + "loss": 1.0292, + "mean_token_accuracy": 0.688581109046936, + "num_tokens": 45513665.0, + "step": 1787 + }, + { + "epoch": 0.19635405227322644, + "grad_norm": 2.2314560413360596, + "learning_rate": 9.807903402854007e-07, + "loss": 0.9362, + "mean_token_accuracy": 0.7240300178527832, + "num_tokens": 45538394.0, + "step": 1788 + }, + { + "epoch": 0.19646386997584012, + "grad_norm": 2.040489673614502, + "learning_rate": 9.813391877058177e-07, + "loss": 1.0574, + "mean_token_accuracy": 0.6857410669326782, + "num_tokens": 45570312.0, + "step": 1789 + }, + { + "epoch": 0.19657368767845376, + "grad_norm": 2.357863187789917, + "learning_rate": 9.818880351262348e-07, + "loss": 0.9165, + "mean_token_accuracy": 0.7175657749176025, + "num_tokens": 45592928.0, + "step": 1790 + }, + { + "epoch": 0.19668350538106744, + "grad_norm": 2.3531455993652344, + "learning_rate": 9.82436882546652e-07, + "loss": 1.0733, + "mean_token_accuracy": 0.6781719326972961, + "num_tokens": 45618184.0, + "step": 1791 + }, + { + "epoch": 0.19679332308368108, + "grad_norm": 2.2618722915649414, + "learning_rate": 9.829857299670691e-07, + "loss": 1.0068, + "mean_token_accuracy": 0.7013165950775146, + "num_tokens": 45643932.0, + "step": 1792 + }, + { + "epoch": 0.19690314078629476, + "grad_norm": 2.2969107627868652, + "learning_rate": 9.835345773874862e-07, + "loss": 0.971, + "mean_token_accuracy": 0.7056207060813904, + "num_tokens": 45668777.0, + "step": 1793 + }, + { + 
"epoch": 0.1970129584889084, + "grad_norm": 2.244861364364624, + "learning_rate": 9.840834248079035e-07, + "loss": 0.9963, + "mean_token_accuracy": 0.7110983729362488, + "num_tokens": 45697013.0, + "step": 1794 + }, + { + "epoch": 0.19712277619152208, + "grad_norm": 2.5583724975585938, + "learning_rate": 9.846322722283205e-07, + "loss": 0.9318, + "mean_token_accuracy": 0.7178381085395813, + "num_tokens": 45718144.0, + "step": 1795 + }, + { + "epoch": 0.19723259389413572, + "grad_norm": 2.332895278930664, + "learning_rate": 9.851811196487376e-07, + "loss": 1.0574, + "mean_token_accuracy": 0.6826266646385193, + "num_tokens": 45742216.0, + "step": 1796 + }, + { + "epoch": 0.1973424115967494, + "grad_norm": 2.5327365398406982, + "learning_rate": 9.857299670691546e-07, + "loss": 1.0383, + "mean_token_accuracy": 0.6847718954086304, + "num_tokens": 45762848.0, + "step": 1797 + }, + { + "epoch": 0.19745222929936307, + "grad_norm": 2.1913352012634277, + "learning_rate": 9.86278814489572e-07, + "loss": 0.9792, + "mean_token_accuracy": 0.7048035264015198, + "num_tokens": 45788658.0, + "step": 1798 + }, + { + "epoch": 0.19756204700197671, + "grad_norm": 2.0356040000915527, + "learning_rate": 9.86827661909989e-07, + "loss": 1.082, + "mean_token_accuracy": 0.6778355836868286, + "num_tokens": 45819722.0, + "step": 1799 + }, + { + "epoch": 0.1976718647045904, + "grad_norm": 2.2424261569976807, + "learning_rate": 9.87376509330406e-07, + "loss": 1.0042, + "mean_token_accuracy": 0.6955643892288208, + "num_tokens": 45847230.0, + "step": 1800 + }, + { + "epoch": 0.19778168240720403, + "grad_norm": 2.41615629196167, + "learning_rate": 9.879253567508233e-07, + "loss": 1.0125, + "mean_token_accuracy": 0.6946986317634583, + "num_tokens": 45871290.0, + "step": 1801 + }, + { + "epoch": 0.1978915001098177, + "grad_norm": 2.4835588932037354, + "learning_rate": 9.884742041712404e-07, + "loss": 0.9648, + "mean_token_accuracy": 0.7076998353004456, + "num_tokens": 45891361.0, + "step": 1802 + }, + 
{ + "epoch": 0.19800131781243135, + "grad_norm": 2.257565498352051, + "learning_rate": 9.890230515916574e-07, + "loss": 1.0128, + "mean_token_accuracy": 0.6958199739456177, + "num_tokens": 45916930.0, + "step": 1803 + }, + { + "epoch": 0.19811113551504503, + "grad_norm": 2.4328243732452393, + "learning_rate": 9.895718990120747e-07, + "loss": 0.9454, + "mean_token_accuracy": 0.7116910219192505, + "num_tokens": 45940664.0, + "step": 1804 + }, + { + "epoch": 0.19822095321765867, + "grad_norm": 2.2717716693878174, + "learning_rate": 9.901207464324918e-07, + "loss": 0.9793, + "mean_token_accuracy": 0.7085970640182495, + "num_tokens": 45965410.0, + "step": 1805 + }, + { + "epoch": 0.19833077092027235, + "grad_norm": 2.2904114723205566, + "learning_rate": 9.906695938529088e-07, + "loss": 0.9582, + "mean_token_accuracy": 0.7161852717399597, + "num_tokens": 45990273.0, + "step": 1806 + }, + { + "epoch": 0.19844058862288602, + "grad_norm": 2.0856804847717285, + "learning_rate": 9.91218441273326e-07, + "loss": 0.9942, + "mean_token_accuracy": 0.6990795135498047, + "num_tokens": 46019949.0, + "step": 1807 + }, + { + "epoch": 0.19855040632549967, + "grad_norm": 2.368394374847412, + "learning_rate": 9.91767288693743e-07, + "loss": 1.1056, + "mean_token_accuracy": 0.6915774345397949, + "num_tokens": 46046639.0, + "step": 1808 + }, + { + "epoch": 0.19866022402811334, + "grad_norm": 2.5624401569366455, + "learning_rate": 9.923161361141602e-07, + "loss": 1.0397, + "mean_token_accuracy": 0.687825620174408, + "num_tokens": 46068424.0, + "step": 1809 + }, + { + "epoch": 0.198770041730727, + "grad_norm": 2.439033269882202, + "learning_rate": 9.928649835345773e-07, + "loss": 0.9924, + "mean_token_accuracy": 0.6964056491851807, + "num_tokens": 46093924.0, + "step": 1810 + }, + { + "epoch": 0.19887985943334066, + "grad_norm": 2.3646726608276367, + "learning_rate": 9.934138309549944e-07, + "loss": 1.0477, + "mean_token_accuracy": 0.6811033487319946, + "num_tokens": 46119562.0, + "step": 
1811 + }, + { + "epoch": 0.1989896771359543, + "grad_norm": 2.52201509475708, + "learning_rate": 9.939626783754116e-07, + "loss": 0.9644, + "mean_token_accuracy": 0.7092045545578003, + "num_tokens": 46141341.0, + "step": 1812 + }, + { + "epoch": 0.19909949483856798, + "grad_norm": 2.2739787101745605, + "learning_rate": 9.945115257958287e-07, + "loss": 1.0711, + "mean_token_accuracy": 0.684067964553833, + "num_tokens": 46167012.0, + "step": 1813 + }, + { + "epoch": 0.19920931254118163, + "grad_norm": 2.310981512069702, + "learning_rate": 9.950603732162458e-07, + "loss": 0.9736, + "mean_token_accuracy": 0.7114216089248657, + "num_tokens": 46194207.0, + "step": 1814 + }, + { + "epoch": 0.1993191302437953, + "grad_norm": 2.240684986114502, + "learning_rate": 9.95609220636663e-07, + "loss": 1.1475, + "mean_token_accuracy": 0.6614152789115906, + "num_tokens": 46220884.0, + "step": 1815 + }, + { + "epoch": 0.19942894794640897, + "grad_norm": 2.3524820804595947, + "learning_rate": 9.9615806805708e-07, + "loss": 1.0357, + "mean_token_accuracy": 0.6920208930969238, + "num_tokens": 46244997.0, + "step": 1816 + }, + { + "epoch": 0.19953876564902262, + "grad_norm": 2.8725926876068115, + "learning_rate": 9.967069154774972e-07, + "loss": 0.9974, + "mean_token_accuracy": 0.7036059498786926, + "num_tokens": 46261662.0, + "step": 1817 + }, + { + "epoch": 0.1996485833516363, + "grad_norm": 2.1956634521484375, + "learning_rate": 9.972557628979144e-07, + "loss": 1.0849, + "mean_token_accuracy": 0.6822222471237183, + "num_tokens": 46294308.0, + "step": 1818 + }, + { + "epoch": 0.19975840105424994, + "grad_norm": 2.6471333503723145, + "learning_rate": 9.978046103183315e-07, + "loss": 0.9061, + "mean_token_accuracy": 0.7170181274414062, + "num_tokens": 46312864.0, + "step": 1819 + }, + { + "epoch": 0.1998682187568636, + "grad_norm": 2.1689960956573486, + "learning_rate": 9.983534577387486e-07, + "loss": 1.041, + "mean_token_accuracy": 0.6867758631706238, + "num_tokens": 46342256.0, + 
"step": 1820 + }, + { + "epoch": 0.19997803645947726, + "grad_norm": 2.273571014404297, + "learning_rate": 9.989023051591658e-07, + "loss": 1.0354, + "mean_token_accuracy": 0.6915568113327026, + "num_tokens": 46367345.0, + "step": 1821 + }, + { + "epoch": 0.20008785416209093, + "grad_norm": 1.9695574045181274, + "learning_rate": 9.994511525795829e-07, + "loss": 1.0939, + "mean_token_accuracy": 0.6705816984176636, + "num_tokens": 46399439.0, + "step": 1822 + }, + { + "epoch": 0.20019767186470458, + "grad_norm": 2.311911106109619, + "learning_rate": 1e-06, + "loss": 1.1367, + "mean_token_accuracy": 0.6648699045181274, + "num_tokens": 46424653.0, + "step": 1823 + }, + { + "epoch": 0.20030748956731825, + "grad_norm": 2.471803903579712, + "learning_rate": 1e-06, + "loss": 1.0349, + "mean_token_accuracy": 0.6989032030105591, + "num_tokens": 46446366.0, + "step": 1824 + }, + { + "epoch": 0.2004173072699319, + "grad_norm": 2.5937228202819824, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7181717157363892, + "num_tokens": 46465442.0, + "step": 1825 + }, + { + "epoch": 0.20052712497254557, + "grad_norm": 2.327658176422119, + "learning_rate": 1e-06, + "loss": 1.0859, + "mean_token_accuracy": 0.6805223226547241, + "num_tokens": 46492667.0, + "step": 1826 + }, + { + "epoch": 0.20063694267515925, + "grad_norm": 2.1520004272460938, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.6964447498321533, + "num_tokens": 46521256.0, + "step": 1827 + }, + { + "epoch": 0.2007467603777729, + "grad_norm": 2.3452084064483643, + "learning_rate": 1e-06, + "loss": 1.0244, + "mean_token_accuracy": 0.6945198178291321, + "num_tokens": 46545380.0, + "step": 1828 + }, + { + "epoch": 0.20085657808038657, + "grad_norm": 2.1560795307159424, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.71068274974823, + "num_tokens": 46572025.0, + "step": 1829 + }, + { + "epoch": 0.2009663957830002, + "grad_norm": 2.554758310317993, + 
"learning_rate": 1e-06, + "loss": 1.095, + "mean_token_accuracy": 0.674872100353241, + "num_tokens": 46595114.0, + "step": 1830 + }, + { + "epoch": 0.20107621348561389, + "grad_norm": 2.012479782104492, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7046884298324585, + "num_tokens": 46624709.0, + "step": 1831 + }, + { + "epoch": 0.20118603118822753, + "grad_norm": 2.4840264320373535, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7174144983291626, + "num_tokens": 46645546.0, + "step": 1832 + }, + { + "epoch": 0.2012958488908412, + "grad_norm": 2.277111530303955, + "learning_rate": 1e-06, + "loss": 1.0515, + "mean_token_accuracy": 0.689028263092041, + "num_tokens": 46672082.0, + "step": 1833 + }, + { + "epoch": 0.20140566659345485, + "grad_norm": 2.7179105281829834, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.7021313905715942, + "num_tokens": 46691253.0, + "step": 1834 + }, + { + "epoch": 0.20151548429606853, + "grad_norm": 2.231475353240967, + "learning_rate": 1e-06, + "loss": 1.0136, + "mean_token_accuracy": 0.7007499933242798, + "num_tokens": 46720218.0, + "step": 1835 + }, + { + "epoch": 0.2016253019986822, + "grad_norm": 2.6544299125671387, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.6996680498123169, + "num_tokens": 46739676.0, + "step": 1836 + }, + { + "epoch": 0.20173511970129585, + "grad_norm": 2.1055352687835693, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.7023115754127502, + "num_tokens": 46767912.0, + "step": 1837 + }, + { + "epoch": 0.20184493740390952, + "grad_norm": 2.242220401763916, + "learning_rate": 1e-06, + "loss": 1.0862, + "mean_token_accuracy": 0.6952131986618042, + "num_tokens": 46794825.0, + "step": 1838 + }, + { + "epoch": 0.20195475510652316, + "grad_norm": 2.485947608947754, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7305927276611328, + "num_tokens": 46815100.0, + "step": 1839 + }, + 
{ + "epoch": 0.20206457280913684, + "grad_norm": 2.6673903465270996, + "learning_rate": 1e-06, + "loss": 1.054, + "mean_token_accuracy": 0.6875131726264954, + "num_tokens": 46836320.0, + "step": 1840 + }, + { + "epoch": 0.20217439051175048, + "grad_norm": 2.4767839908599854, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.6979014277458191, + "num_tokens": 46857355.0, + "step": 1841 + }, + { + "epoch": 0.20228420821436416, + "grad_norm": 2.3053035736083984, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.7096846103668213, + "num_tokens": 46880904.0, + "step": 1842 + }, + { + "epoch": 0.2023940259169778, + "grad_norm": 2.3351807594299316, + "learning_rate": 1e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.7078403234481812, + "num_tokens": 46904933.0, + "step": 1843 + }, + { + "epoch": 0.20250384361959148, + "grad_norm": 2.3019027709960938, + "learning_rate": 1e-06, + "loss": 1.099, + "mean_token_accuracy": 0.6762427091598511, + "num_tokens": 46930825.0, + "step": 1844 + }, + { + "epoch": 0.20261366132220515, + "grad_norm": 2.446070671081543, + "learning_rate": 1e-06, + "loss": 1.0413, + "mean_token_accuracy": 0.6857598423957825, + "num_tokens": 46952496.0, + "step": 1845 + }, + { + "epoch": 0.2027234790248188, + "grad_norm": 2.028533697128296, + "learning_rate": 1e-06, + "loss": 1.0752, + "mean_token_accuracy": 0.6743870973587036, + "num_tokens": 46982088.0, + "step": 1846 + }, + { + "epoch": 0.20283329672743247, + "grad_norm": 2.091353416442871, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7143195867538452, + "num_tokens": 47008043.0, + "step": 1847 + }, + { + "epoch": 0.20294311443004612, + "grad_norm": 2.5624165534973145, + "learning_rate": 1e-06, + "loss": 1.0525, + "mean_token_accuracy": 0.6893385648727417, + "num_tokens": 47028879.0, + "step": 1848 + }, + { + "epoch": 0.2030529321326598, + "grad_norm": 2.4178552627563477, + "learning_rate": 1e-06, + "loss": 1.1103, + 
"mean_token_accuracy": 0.6712163686752319, + "num_tokens": 47053315.0, + "step": 1849 + }, + { + "epoch": 0.20316274983527344, + "grad_norm": 2.5124599933624268, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.6918601989746094, + "num_tokens": 47074497.0, + "step": 1850 + }, + { + "epoch": 0.2032725675378871, + "grad_norm": 2.513648748397827, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7188289165496826, + "num_tokens": 47095472.0, + "step": 1851 + }, + { + "epoch": 0.20338238524050076, + "grad_norm": 2.1085550785064697, + "learning_rate": 1e-06, + "loss": 1.0657, + "mean_token_accuracy": 0.6803346276283264, + "num_tokens": 47122208.0, + "step": 1852 + }, + { + "epoch": 0.20349220294311443, + "grad_norm": 2.1628639698028564, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.6974505186080933, + "num_tokens": 47150686.0, + "step": 1853 + }, + { + "epoch": 0.2036020206457281, + "grad_norm": 2.2360103130340576, + "learning_rate": 1e-06, + "loss": 1.0687, + "mean_token_accuracy": 0.6795798540115356, + "num_tokens": 47178215.0, + "step": 1854 + }, + { + "epoch": 0.20371183834834175, + "grad_norm": 2.3534750938415527, + "learning_rate": 1e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.6967285871505737, + "num_tokens": 47205714.0, + "step": 1855 + }, + { + "epoch": 0.20382165605095542, + "grad_norm": 2.0938382148742676, + "learning_rate": 1e-06, + "loss": 1.0976, + "mean_token_accuracy": 0.6739954352378845, + "num_tokens": 47234887.0, + "step": 1856 + }, + { + "epoch": 0.20393147375356907, + "grad_norm": 2.045886993408203, + "learning_rate": 1e-06, + "loss": 1.0927, + "mean_token_accuracy": 0.6851263046264648, + "num_tokens": 47267015.0, + "step": 1857 + }, + { + "epoch": 0.20404129145618274, + "grad_norm": 2.3380980491638184, + "learning_rate": 1e-06, + "loss": 1.0248, + "mean_token_accuracy": 0.6979753971099854, + "num_tokens": 47291302.0, + "step": 1858 + }, + { + "epoch": 0.2041511091587964, + 
"grad_norm": 2.4308156967163086, + "learning_rate": 1e-06, + "loss": 1.0495, + "mean_token_accuracy": 0.6828248500823975, + "num_tokens": 47316589.0, + "step": 1859 + }, + { + "epoch": 0.20426092686141006, + "grad_norm": 2.421875, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7140565514564514, + "num_tokens": 47339414.0, + "step": 1860 + }, + { + "epoch": 0.2043707445640237, + "grad_norm": 2.4443745613098145, + "learning_rate": 1e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.6914085149765015, + "num_tokens": 47362127.0, + "step": 1861 + }, + { + "epoch": 0.20448056226663738, + "grad_norm": 2.4243228435516357, + "learning_rate": 1e-06, + "loss": 1.0787, + "mean_token_accuracy": 0.6774023175239563, + "num_tokens": 47386233.0, + "step": 1862 + }, + { + "epoch": 0.20459037996925103, + "grad_norm": 2.364138603210449, + "learning_rate": 1e-06, + "loss": 1.0076, + "mean_token_accuracy": 0.7074751853942871, + "num_tokens": 47410111.0, + "step": 1863 + }, + { + "epoch": 0.2047001976718647, + "grad_norm": 2.11333966255188, + "learning_rate": 1e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.6969994306564331, + "num_tokens": 47440661.0, + "step": 1864 + }, + { + "epoch": 0.20481001537447838, + "grad_norm": 2.0591938495635986, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7111263871192932, + "num_tokens": 47468745.0, + "step": 1865 + }, + { + "epoch": 0.20491983307709202, + "grad_norm": 2.3719499111175537, + "learning_rate": 1e-06, + "loss": 1.0877, + "mean_token_accuracy": 0.6746224761009216, + "num_tokens": 47492536.0, + "step": 1866 + }, + { + "epoch": 0.2050296507797057, + "grad_norm": 2.7315969467163086, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7035555243492126, + "num_tokens": 47510469.0, + "step": 1867 + }, + { + "epoch": 0.20513946848231934, + "grad_norm": 2.2649762630462646, + "learning_rate": 1e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.6987234354019165, + "num_tokens": 
47537152.0, + "step": 1868 + }, + { + "epoch": 0.20524928618493302, + "grad_norm": 2.2384707927703857, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.6996806263923645, + "num_tokens": 47561130.0, + "step": 1869 + }, + { + "epoch": 0.20535910388754666, + "grad_norm": 2.209228515625, + "learning_rate": 1e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.6920972466468811, + "num_tokens": 47590817.0, + "step": 1870 + }, + { + "epoch": 0.20546892159016034, + "grad_norm": 2.351917028427124, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.6961469650268555, + "num_tokens": 47617181.0, + "step": 1871 + }, + { + "epoch": 0.20557873929277398, + "grad_norm": 2.4117226600646973, + "learning_rate": 1e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.6859129071235657, + "num_tokens": 47641072.0, + "step": 1872 + }, + { + "epoch": 0.20568855699538766, + "grad_norm": 2.0837855339050293, + "learning_rate": 1e-06, + "loss": 1.0739, + "mean_token_accuracy": 0.6766780614852905, + "num_tokens": 47673390.0, + "step": 1873 + }, + { + "epoch": 0.20579837469800133, + "grad_norm": 2.0287601947784424, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.6966858506202698, + "num_tokens": 47705902.0, + "step": 1874 + }, + { + "epoch": 0.20590819240061498, + "grad_norm": 2.1748456954956055, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7101460695266724, + "num_tokens": 47733803.0, + "step": 1875 + }, + { + "epoch": 0.20601801010322865, + "grad_norm": 2.202562093734741, + "learning_rate": 1e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.6832212805747986, + "num_tokens": 47761792.0, + "step": 1876 + }, + { + "epoch": 0.2061278278058423, + "grad_norm": 2.551387310028076, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7159050703048706, + "num_tokens": 47781968.0, + "step": 1877 + }, + { + "epoch": 0.20623764550845597, + "grad_norm": 2.06439208984375, + "learning_rate": 1e-06, + "loss": 
1.1014, + "mean_token_accuracy": 0.6706926822662354, + "num_tokens": 47813259.0, + "step": 1878 + }, + { + "epoch": 0.20634746321106961, + "grad_norm": 2.6533448696136475, + "learning_rate": 1e-06, + "loss": 1.0298, + "mean_token_accuracy": 0.6867350339889526, + "num_tokens": 47836620.0, + "step": 1879 + }, + { + "epoch": 0.2064572809136833, + "grad_norm": 2.232316255569458, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.7031101584434509, + "num_tokens": 47861288.0, + "step": 1880 + }, + { + "epoch": 0.20656709861629693, + "grad_norm": 2.160526990890503, + "learning_rate": 1e-06, + "loss": 1.1013, + "mean_token_accuracy": 0.6704223155975342, + "num_tokens": 47890122.0, + "step": 1881 + }, + { + "epoch": 0.2066769163189106, + "grad_norm": 2.3054001331329346, + "learning_rate": 1e-06, + "loss": 1.051, + "mean_token_accuracy": 0.6879686713218689, + "num_tokens": 47915680.0, + "step": 1882 + }, + { + "epoch": 0.20678673402152428, + "grad_norm": 2.2656116485595703, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.6970322728157043, + "num_tokens": 47943248.0, + "step": 1883 + }, + { + "epoch": 0.20689655172413793, + "grad_norm": 2.3955330848693848, + "learning_rate": 1e-06, + "loss": 1.0773, + "mean_token_accuracy": 0.6800252199172974, + "num_tokens": 47965799.0, + "step": 1884 + }, + { + "epoch": 0.2070063694267516, + "grad_norm": 2.1820595264434814, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7226540446281433, + "num_tokens": 47991468.0, + "step": 1885 + }, + { + "epoch": 0.20711618712936525, + "grad_norm": 2.140744209289551, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.6880536079406738, + "num_tokens": 48020999.0, + "step": 1886 + }, + { + "epoch": 0.20722600483197892, + "grad_norm": 2.319838523864746, + "learning_rate": 1e-06, + "loss": 1.024, + "mean_token_accuracy": 0.6958667039871216, + "num_tokens": 48043630.0, + "step": 1887 + }, + { + "epoch": 0.20733582253459257, 
+ "grad_norm": 2.280912399291992, + "learning_rate": 1e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.685396671295166, + "num_tokens": 48069819.0, + "step": 1888 + }, + { + "epoch": 0.20744564023720624, + "grad_norm": 2.7122268676757812, + "learning_rate": 1e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.6989681720733643, + "num_tokens": 48091481.0, + "step": 1889 + }, + { + "epoch": 0.2075554579398199, + "grad_norm": 2.149678945541382, + "learning_rate": 1e-06, + "loss": 1.0401, + "mean_token_accuracy": 0.6897359490394592, + "num_tokens": 48119366.0, + "step": 1890 + }, + { + "epoch": 0.20766527564243356, + "grad_norm": 2.1208200454711914, + "learning_rate": 1e-06, + "loss": 1.134, + "mean_token_accuracy": 0.6663587093353271, + "num_tokens": 48151661.0, + "step": 1891 + }, + { + "epoch": 0.20777509334504723, + "grad_norm": 2.5314815044403076, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7198130488395691, + "num_tokens": 48172585.0, + "step": 1892 + }, + { + "epoch": 0.20788491104766088, + "grad_norm": 2.2846968173980713, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.6924805641174316, + "num_tokens": 48198956.0, + "step": 1893 + }, + { + "epoch": 0.20799472875027455, + "grad_norm": 2.1765975952148438, + "learning_rate": 1e-06, + "loss": 1.0574, + "mean_token_accuracy": 0.6767698526382446, + "num_tokens": 48225665.0, + "step": 1894 + }, + { + "epoch": 0.2081045464528882, + "grad_norm": 2.309967517852783, + "learning_rate": 1e-06, + "loss": 1.0777, + "mean_token_accuracy": 0.6794991493225098, + "num_tokens": 48250388.0, + "step": 1895 + }, + { + "epoch": 0.20821436415550187, + "grad_norm": 2.2817983627319336, + "learning_rate": 1e-06, + "loss": 1.0334, + "mean_token_accuracy": 0.6829582452774048, + "num_tokens": 48275403.0, + "step": 1896 + }, + { + "epoch": 0.20832418185811552, + "grad_norm": 2.8285934925079346, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7167243957519531, + 
"num_tokens": 48292381.0, + "step": 1897 + }, + { + "epoch": 0.2084339995607292, + "grad_norm": 2.714754104614258, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.6942762732505798, + "num_tokens": 48312110.0, + "step": 1898 + }, + { + "epoch": 0.20854381726334284, + "grad_norm": 2.456406831741333, + "learning_rate": 1e-06, + "loss": 1.0787, + "mean_token_accuracy": 0.6817563772201538, + "num_tokens": 48336110.0, + "step": 1899 + }, + { + "epoch": 0.2086536349659565, + "grad_norm": 2.519508123397827, + "learning_rate": 1e-06, + "loss": 1.1014, + "mean_token_accuracy": 0.6841234564781189, + "num_tokens": 48358491.0, + "step": 1900 + }, + { + "epoch": 0.20876345266857016, + "grad_norm": 1.9899773597717285, + "learning_rate": 1e-06, + "loss": 1.0542, + "mean_token_accuracy": 0.6851367354393005, + "num_tokens": 48389249.0, + "step": 1901 + }, + { + "epoch": 0.20887327037118383, + "grad_norm": 2.026850700378418, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7096489071846008, + "num_tokens": 48419366.0, + "step": 1902 + }, + { + "epoch": 0.2089830880737975, + "grad_norm": 2.499530553817749, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.6937968134880066, + "num_tokens": 48439636.0, + "step": 1903 + }, + { + "epoch": 0.20909290577641115, + "grad_norm": 2.1189193725585938, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.6965630650520325, + "num_tokens": 48466506.0, + "step": 1904 + }, + { + "epoch": 0.20920272347902483, + "grad_norm": 2.3787927627563477, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7225466966629028, + "num_tokens": 48489825.0, + "step": 1905 + }, + { + "epoch": 0.20931254118163847, + "grad_norm": 2.3576457500457764, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.6980785131454468, + "num_tokens": 48512036.0, + "step": 1906 + }, + { + "epoch": 0.20942235888425215, + "grad_norm": 2.39109468460083, + "learning_rate": 
1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.6945667266845703, + "num_tokens": 48534114.0, + "step": 1907 + }, + { + "epoch": 0.2095321765868658, + "grad_norm": 2.4854607582092285, + "learning_rate": 1e-06, + "loss": 1.0889, + "mean_token_accuracy": 0.6819785833358765, + "num_tokens": 48555863.0, + "step": 1908 + }, + { + "epoch": 0.20964199428947947, + "grad_norm": 2.262483835220337, + "learning_rate": 1e-06, + "loss": 1.0392, + "mean_token_accuracy": 0.6900863647460938, + "num_tokens": 48580385.0, + "step": 1909 + }, + { + "epoch": 0.2097518119920931, + "grad_norm": 2.4272754192352295, + "learning_rate": 1e-06, + "loss": 1.027, + "mean_token_accuracy": 0.6898348927497864, + "num_tokens": 48605450.0, + "step": 1910 + }, + { + "epoch": 0.20986162969470679, + "grad_norm": 2.0113487243652344, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7181375622749329, + "num_tokens": 48633966.0, + "step": 1911 + }, + { + "epoch": 0.20997144739732046, + "grad_norm": 2.515557050704956, + "learning_rate": 1e-06, + "loss": 0.8604, + "mean_token_accuracy": 0.7339577078819275, + "num_tokens": 48652970.0, + "step": 1912 + }, + { + "epoch": 0.2100812650999341, + "grad_norm": 2.473635673522949, + "learning_rate": 1e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.6898924708366394, + "num_tokens": 48676840.0, + "step": 1913 + }, + { + "epoch": 0.21019108280254778, + "grad_norm": 2.1369717121124268, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.6987172365188599, + "num_tokens": 48705727.0, + "step": 1914 + }, + { + "epoch": 0.21030090050516143, + "grad_norm": 2.329319477081299, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.6954469084739685, + "num_tokens": 48730312.0, + "step": 1915 + }, + { + "epoch": 0.2104107182077751, + "grad_norm": 2.5129449367523193, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7273863554000854, + "num_tokens": 48750133.0, + "step": 1916 + }, + { + "epoch": 
0.21052053591038875, + "grad_norm": 2.276545763015747, + "learning_rate": 1e-06, + "loss": 1.079, + "mean_token_accuracy": 0.6762757301330566, + "num_tokens": 48774711.0, + "step": 1917 + }, + { + "epoch": 0.21063035361300242, + "grad_norm": 2.5095736980438232, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.6981823444366455, + "num_tokens": 48796250.0, + "step": 1918 + }, + { + "epoch": 0.21074017131561606, + "grad_norm": 2.590902090072632, + "learning_rate": 1e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.6871325373649597, + "num_tokens": 48816915.0, + "step": 1919 + }, + { + "epoch": 0.21084998901822974, + "grad_norm": 2.126894950866699, + "learning_rate": 1e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.6893333196640015, + "num_tokens": 48848080.0, + "step": 1920 + }, + { + "epoch": 0.2109598067208434, + "grad_norm": 2.0448684692382812, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6911464333534241, + "num_tokens": 48875710.0, + "step": 1921 + }, + { + "epoch": 0.21106962442345706, + "grad_norm": 2.2984085083007812, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.692615270614624, + "num_tokens": 48902221.0, + "step": 1922 + }, + { + "epoch": 0.21117944212607073, + "grad_norm": 2.1140270233154297, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7163800001144409, + "num_tokens": 48929524.0, + "step": 1923 + }, + { + "epoch": 0.21128925982868438, + "grad_norm": 2.302901029586792, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.696426272392273, + "num_tokens": 48953424.0, + "step": 1924 + }, + { + "epoch": 0.21139907753129805, + "grad_norm": 2.2815065383911133, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.7105683088302612, + "num_tokens": 48979936.0, + "step": 1925 + }, + { + "epoch": 0.2115088952339117, + "grad_norm": 2.4265544414520264, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 
0.7100645303726196, + "num_tokens": 49002704.0, + "step": 1926 + }, + { + "epoch": 0.21161871293652537, + "grad_norm": 2.0555474758148193, + "learning_rate": 1e-06, + "loss": 1.0651, + "mean_token_accuracy": 0.6809419989585876, + "num_tokens": 49033675.0, + "step": 1927 + }, + { + "epoch": 0.21172853063913902, + "grad_norm": 2.25212025642395, + "learning_rate": 1e-06, + "loss": 1.0377, + "mean_token_accuracy": 0.6879836320877075, + "num_tokens": 49059323.0, + "step": 1928 + }, + { + "epoch": 0.2118383483417527, + "grad_norm": 2.3991000652313232, + "learning_rate": 1e-06, + "loss": 1.0687, + "mean_token_accuracy": 0.6805965900421143, + "num_tokens": 49083413.0, + "step": 1929 + }, + { + "epoch": 0.21194816604436637, + "grad_norm": 2.28230619430542, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7089748382568359, + "num_tokens": 49107979.0, + "step": 1930 + }, + { + "epoch": 0.21205798374698, + "grad_norm": 2.407106399536133, + "learning_rate": 1e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7334938049316406, + "num_tokens": 49128710.0, + "step": 1931 + }, + { + "epoch": 0.21216780144959368, + "grad_norm": 2.3664071559906006, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.6964482069015503, + "num_tokens": 49154041.0, + "step": 1932 + }, + { + "epoch": 0.21227761915220733, + "grad_norm": 2.3221981525421143, + "learning_rate": 1e-06, + "loss": 1.0966, + "mean_token_accuracy": 0.6764758825302124, + "num_tokens": 49178535.0, + "step": 1933 + }, + { + "epoch": 0.212387436854821, + "grad_norm": 2.0296881198883057, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6900644302368164, + "num_tokens": 49210214.0, + "step": 1934 + }, + { + "epoch": 0.21249725455743465, + "grad_norm": 2.4845855236053467, + "learning_rate": 1e-06, + "loss": 0.8626, + "mean_token_accuracy": 0.7325617074966431, + "num_tokens": 49229461.0, + "step": 1935 + }, + { + "epoch": 0.21260707226004832, + "grad_norm": 2.436516284942627, + 
"learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6982918977737427, + "num_tokens": 49251108.0, + "step": 1936 + }, + { + "epoch": 0.21271688996266197, + "grad_norm": 2.060933828353882, + "learning_rate": 1e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.6956087946891785, + "num_tokens": 49282571.0, + "step": 1937 + }, + { + "epoch": 0.21282670766527564, + "grad_norm": 2.1709587574005127, + "learning_rate": 1e-06, + "loss": 1.1077, + "mean_token_accuracy": 0.6729236841201782, + "num_tokens": 49312542.0, + "step": 1938 + }, + { + "epoch": 0.2129365253678893, + "grad_norm": 1.9191261529922485, + "learning_rate": 1e-06, + "loss": 1.0504, + "mean_token_accuracy": 0.6823369264602661, + "num_tokens": 49348965.0, + "step": 1939 + }, + { + "epoch": 0.21304634307050296, + "grad_norm": 2.509204387664795, + "learning_rate": 1e-06, + "loss": 1.0566, + "mean_token_accuracy": 0.6864333748817444, + "num_tokens": 49370132.0, + "step": 1940 + }, + { + "epoch": 0.21315616077311664, + "grad_norm": 2.1410410404205322, + "learning_rate": 1e-06, + "loss": 1.0433, + "mean_token_accuracy": 0.6938194632530212, + "num_tokens": 49399954.0, + "step": 1941 + }, + { + "epoch": 0.21326597847573028, + "grad_norm": 2.0717127323150635, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.691328763961792, + "num_tokens": 49431694.0, + "step": 1942 + }, + { + "epoch": 0.21337579617834396, + "grad_norm": 2.381588935852051, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.6983271837234497, + "num_tokens": 49455968.0, + "step": 1943 + }, + { + "epoch": 0.2134856138809576, + "grad_norm": 2.1094725131988525, + "learning_rate": 1e-06, + "loss": 1.1834, + "mean_token_accuracy": 0.6558998823165894, + "num_tokens": 49486157.0, + "step": 1944 + }, + { + "epoch": 0.21359543158357128, + "grad_norm": 2.1697568893432617, + "learning_rate": 1e-06, + "loss": 1.0832, + "mean_token_accuracy": 0.6791124939918518, + "num_tokens": 49516397.0, + "step": 1945 + }, 
+ { + "epoch": 0.21370524928618492, + "grad_norm": 2.5253307819366455, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.7095369696617126, + "num_tokens": 49538580.0, + "step": 1946 + }, + { + "epoch": 0.2138150669887986, + "grad_norm": 2.1016457080841064, + "learning_rate": 1e-06, + "loss": 1.1331, + "mean_token_accuracy": 0.6621771454811096, + "num_tokens": 49571157.0, + "step": 1947 + }, + { + "epoch": 0.21392488469141224, + "grad_norm": 2.009505033493042, + "learning_rate": 1e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7147212624549866, + "num_tokens": 49600338.0, + "step": 1948 + }, + { + "epoch": 0.21403470239402592, + "grad_norm": 2.393479347229004, + "learning_rate": 1e-06, + "loss": 1.1197, + "mean_token_accuracy": 0.665485680103302, + "num_tokens": 49623371.0, + "step": 1949 + }, + { + "epoch": 0.2141445200966396, + "grad_norm": 2.095012903213501, + "learning_rate": 1e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.7032893896102905, + "num_tokens": 49650247.0, + "step": 1950 + }, + { + "epoch": 0.21425433779925324, + "grad_norm": 2.2020199298858643, + "learning_rate": 1e-06, + "loss": 1.0261, + "mean_token_accuracy": 0.6895933747291565, + "num_tokens": 49676781.0, + "step": 1951 + }, + { + "epoch": 0.2143641555018669, + "grad_norm": 2.3800668716430664, + "learning_rate": 1e-06, + "loss": 1.081, + "mean_token_accuracy": 0.6752529144287109, + "num_tokens": 49701026.0, + "step": 1952 + }, + { + "epoch": 0.21447397320448056, + "grad_norm": 2.1826493740081787, + "learning_rate": 1e-06, + "loss": 1.0545, + "mean_token_accuracy": 0.6961292028427124, + "num_tokens": 49726364.0, + "step": 1953 + }, + { + "epoch": 0.21458379090709423, + "grad_norm": 2.6658596992492676, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.6975447535514832, + "num_tokens": 49744627.0, + "step": 1954 + }, + { + "epoch": 0.21469360860970788, + "grad_norm": 2.7099716663360596, + "learning_rate": 1e-06, + "loss": 1.0052, + 
"mean_token_accuracy": 0.7133839130401611, + "num_tokens": 49764165.0, + "step": 1955 + }, + { + "epoch": 0.21480342631232155, + "grad_norm": 2.2834646701812744, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7207927703857422, + "num_tokens": 49789500.0, + "step": 1956 + }, + { + "epoch": 0.2149132440149352, + "grad_norm": 2.1510372161865234, + "learning_rate": 1e-06, + "loss": 1.0752, + "mean_token_accuracy": 0.677037239074707, + "num_tokens": 49818249.0, + "step": 1957 + }, + { + "epoch": 0.21502306171754887, + "grad_norm": 2.3509268760681152, + "learning_rate": 1e-06, + "loss": 1.1211, + "mean_token_accuracy": 0.6654765009880066, + "num_tokens": 49842362.0, + "step": 1958 + }, + { + "epoch": 0.21513287942016254, + "grad_norm": 2.5239009857177734, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.7034370303153992, + "num_tokens": 49864021.0, + "step": 1959 + }, + { + "epoch": 0.2152426971227762, + "grad_norm": 2.0809197425842285, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.6871123313903809, + "num_tokens": 49892691.0, + "step": 1960 + }, + { + "epoch": 0.21535251482538986, + "grad_norm": 2.1437835693359375, + "learning_rate": 1e-06, + "loss": 1.0728, + "mean_token_accuracy": 0.6756577491760254, + "num_tokens": 49921515.0, + "step": 1961 + }, + { + "epoch": 0.2154623325280035, + "grad_norm": 2.1241347789764404, + "learning_rate": 1e-06, + "loss": 1.0764, + "mean_token_accuracy": 0.6777942776679993, + "num_tokens": 49950377.0, + "step": 1962 + }, + { + "epoch": 0.21557215023061718, + "grad_norm": 2.4960217475891113, + "learning_rate": 1e-06, + "loss": 1.0435, + "mean_token_accuracy": 0.6862612366676331, + "num_tokens": 49974198.0, + "step": 1963 + }, + { + "epoch": 0.21568196793323083, + "grad_norm": 2.0469348430633545, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7082263231277466, + "num_tokens": 50001588.0, + "step": 1964 + }, + { + "epoch": 0.2157917856358445, + 
"grad_norm": 2.0523135662078857, + "learning_rate": 1e-06, + "loss": 1.0453, + "mean_token_accuracy": 0.6941221356391907, + "num_tokens": 50029902.0, + "step": 1965 + }, + { + "epoch": 0.21590160333845815, + "grad_norm": 2.0878686904907227, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.6940515637397766, + "num_tokens": 50057530.0, + "step": 1966 + }, + { + "epoch": 0.21601142104107182, + "grad_norm": 2.3789889812469482, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.702671468257904, + "num_tokens": 50080796.0, + "step": 1967 + }, + { + "epoch": 0.2161212387436855, + "grad_norm": 2.4908018112182617, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7045761346817017, + "num_tokens": 50102039.0, + "step": 1968 + }, + { + "epoch": 0.21623105644629914, + "grad_norm": 2.2364771366119385, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.6975528001785278, + "num_tokens": 50128490.0, + "step": 1969 + }, + { + "epoch": 0.21634087414891282, + "grad_norm": 2.452584981918335, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.6887911558151245, + "num_tokens": 50150654.0, + "step": 1970 + }, + { + "epoch": 0.21645069185152646, + "grad_norm": 2.4045250415802, + "learning_rate": 1e-06, + "loss": 1.0118, + "mean_token_accuracy": 0.6942392587661743, + "num_tokens": 50173208.0, + "step": 1971 + }, + { + "epoch": 0.21656050955414013, + "grad_norm": 2.1975889205932617, + "learning_rate": 1e-06, + "loss": 1.0349, + "mean_token_accuracy": 0.6897829174995422, + "num_tokens": 50198927.0, + "step": 1972 + }, + { + "epoch": 0.21667032725675378, + "grad_norm": 2.5510668754577637, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.6985907554626465, + "num_tokens": 50219930.0, + "step": 1973 + }, + { + "epoch": 0.21678014495936745, + "grad_norm": 2.189610242843628, + "learning_rate": 1e-06, + "loss": 1.069, + "mean_token_accuracy": 0.6752091646194458, + 
"num_tokens": 50246664.0, + "step": 1974 + }, + { + "epoch": 0.2168899626619811, + "grad_norm": 2.2110989093780518, + "learning_rate": 1e-06, + "loss": 1.0534, + "mean_token_accuracy": 0.6887801289558411, + "num_tokens": 50272061.0, + "step": 1975 + }, + { + "epoch": 0.21699978036459477, + "grad_norm": 2.1093029975891113, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7143308520317078, + "num_tokens": 50297587.0, + "step": 1976 + }, + { + "epoch": 0.21710959806720842, + "grad_norm": 1.9740111827850342, + "learning_rate": 1e-06, + "loss": 1.0728, + "mean_token_accuracy": 0.6828666925430298, + "num_tokens": 50332035.0, + "step": 1977 + }, + { + "epoch": 0.2172194157698221, + "grad_norm": 2.081838846206665, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.6884311437606812, + "num_tokens": 50360028.0, + "step": 1978 + }, + { + "epoch": 0.21732923347243577, + "grad_norm": 2.1878809928894043, + "learning_rate": 1e-06, + "loss": 1.0689, + "mean_token_accuracy": 0.6778382658958435, + "num_tokens": 50389635.0, + "step": 1979 + }, + { + "epoch": 0.2174390511750494, + "grad_norm": 2.2353994846343994, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7023717761039734, + "num_tokens": 50414493.0, + "step": 1980 + }, + { + "epoch": 0.2175488688776631, + "grad_norm": 2.2959749698638916, + "learning_rate": 1e-06, + "loss": 1.0471, + "mean_token_accuracy": 0.6890664100646973, + "num_tokens": 50440743.0, + "step": 1981 + }, + { + "epoch": 0.21765868658027673, + "grad_norm": 2.274635076522827, + "learning_rate": 1e-06, + "loss": 1.082, + "mean_token_accuracy": 0.6740272641181946, + "num_tokens": 50466672.0, + "step": 1982 + }, + { + "epoch": 0.2177685042828904, + "grad_norm": 2.1138229370117188, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7112174034118652, + "num_tokens": 50498245.0, + "step": 1983 + }, + { + "epoch": 0.21787832198550405, + "grad_norm": 2.512376070022583, + "learning_rate": 
1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.6966487169265747, + "num_tokens": 50520819.0, + "step": 1984 + }, + { + "epoch": 0.21798813968811773, + "grad_norm": 1.9340527057647705, + "learning_rate": 1e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.6903910040855408, + "num_tokens": 50554091.0, + "step": 1985 + }, + { + "epoch": 0.21809795739073137, + "grad_norm": 2.696923017501831, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7158451676368713, + "num_tokens": 50571511.0, + "step": 1986 + }, + { + "epoch": 0.21820777509334505, + "grad_norm": 2.318272829055786, + "learning_rate": 1e-06, + "loss": 1.0537, + "mean_token_accuracy": 0.6802971363067627, + "num_tokens": 50596325.0, + "step": 1987 + }, + { + "epoch": 0.21831759279595872, + "grad_norm": 2.5126540660858154, + "learning_rate": 1e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.6932047605514526, + "num_tokens": 50619696.0, + "step": 1988 + }, + { + "epoch": 0.21842741049857237, + "grad_norm": 2.313203811645508, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.6951361894607544, + "num_tokens": 50646662.0, + "step": 1989 + }, + { + "epoch": 0.21853722820118604, + "grad_norm": 2.7441043853759766, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7050741314888, + "num_tokens": 50665874.0, + "step": 1990 + }, + { + "epoch": 0.21864704590379969, + "grad_norm": 2.0319907665252686, + "learning_rate": 1e-06, + "loss": 1.0288, + "mean_token_accuracy": 0.694024384021759, + "num_tokens": 50697198.0, + "step": 1991 + }, + { + "epoch": 0.21875686360641336, + "grad_norm": 2.5210278034210205, + "learning_rate": 1e-06, + "loss": 1.0565, + "mean_token_accuracy": 0.68580162525177, + "num_tokens": 50720611.0, + "step": 1992 + }, + { + "epoch": 0.218866681309027, + "grad_norm": 2.847841501235962, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7150675654411316, + "num_tokens": 50738254.0, + "step": 1993 + }, + { + "epoch": 
0.21897649901164068, + "grad_norm": 2.2370452880859375, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7076108455657959, + "num_tokens": 50763396.0, + "step": 1994 + }, + { + "epoch": 0.21908631671425433, + "grad_norm": 2.008364200592041, + "learning_rate": 1e-06, + "loss": 1.0569, + "mean_token_accuracy": 0.6995313167572021, + "num_tokens": 50794681.0, + "step": 1995 + }, + { + "epoch": 0.219196134416868, + "grad_norm": 2.25443696975708, + "learning_rate": 1e-06, + "loss": 1.0521, + "mean_token_accuracy": 0.6819726824760437, + "num_tokens": 50821494.0, + "step": 1996 + }, + { + "epoch": 0.21930595211948167, + "grad_norm": 2.280733823776245, + "learning_rate": 1e-06, + "loss": 1.0424, + "mean_token_accuracy": 0.6888270378112793, + "num_tokens": 50847713.0, + "step": 1997 + }, + { + "epoch": 0.21941576982209532, + "grad_norm": 2.2587804794311523, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7180658578872681, + "num_tokens": 50873539.0, + "step": 1998 + }, + { + "epoch": 0.219525587524709, + "grad_norm": 2.543811559677124, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7048302292823792, + "num_tokens": 50895460.0, + "step": 1999 + }, + { + "epoch": 0.21963540522732264, + "grad_norm": 1.9299403429031372, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7278801798820496, + "num_tokens": 50926352.0, + "step": 2000 + }, + { + "epoch": 0.2197452229299363, + "grad_norm": 2.50248122215271, + "learning_rate": 1e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.6875660419464111, + "num_tokens": 50948773.0, + "step": 2001 + }, + { + "epoch": 0.21985504063254996, + "grad_norm": 2.4540505409240723, + "learning_rate": 1e-06, + "loss": 1.1068, + "mean_token_accuracy": 0.6778207421302795, + "num_tokens": 50973216.0, + "step": 2002 + }, + { + "epoch": 0.21996485833516363, + "grad_norm": 2.1891579627990723, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 
0.7129496335983276, + "num_tokens": 50999596.0, + "step": 2003 + }, + { + "epoch": 0.22007467603777728, + "grad_norm": 2.331700325012207, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.708118736743927, + "num_tokens": 51022762.0, + "step": 2004 + }, + { + "epoch": 0.22018449374039095, + "grad_norm": 2.145411491394043, + "learning_rate": 1e-06, + "loss": 1.1174, + "mean_token_accuracy": 0.6740360260009766, + "num_tokens": 51053402.0, + "step": 2005 + }, + { + "epoch": 0.22029431144300463, + "grad_norm": 2.6406586170196533, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7160487771034241, + "num_tokens": 51074436.0, + "step": 2006 + }, + { + "epoch": 0.22040412914561827, + "grad_norm": 2.587428092956543, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7043426036834717, + "num_tokens": 51094436.0, + "step": 2007 + }, + { + "epoch": 0.22051394684823195, + "grad_norm": 2.327181100845337, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7025684118270874, + "num_tokens": 51117353.0, + "step": 2008 + }, + { + "epoch": 0.2206237645508456, + "grad_norm": 2.315737009048462, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.713979959487915, + "num_tokens": 51141074.0, + "step": 2009 + }, + { + "epoch": 0.22073358225345927, + "grad_norm": 2.5418500900268555, + "learning_rate": 1e-06, + "loss": 1.0738, + "mean_token_accuracy": 0.6841901540756226, + "num_tokens": 51162242.0, + "step": 2010 + }, + { + "epoch": 0.2208433999560729, + "grad_norm": 2.40753436088562, + "learning_rate": 1e-06, + "loss": 1.0755, + "mean_token_accuracy": 0.6757830381393433, + "num_tokens": 51187233.0, + "step": 2011 + }, + { + "epoch": 0.22095321765868658, + "grad_norm": 2.2168843746185303, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7109310030937195, + "num_tokens": 51214017.0, + "step": 2012 + }, + { + "epoch": 0.22106303536130023, + "grad_norm": 2.472308397293091, + 
"learning_rate": 1e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.698287308216095, + "num_tokens": 51236813.0, + "step": 2013 + }, + { + "epoch": 0.2211728530639139, + "grad_norm": 1.9867134094238281, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7029056549072266, + "num_tokens": 51266655.0, + "step": 2014 + }, + { + "epoch": 0.22128267076652755, + "grad_norm": 2.02474045753479, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.6993328332901001, + "num_tokens": 51299297.0, + "step": 2015 + }, + { + "epoch": 0.22139248846914122, + "grad_norm": 2.242001533508301, + "learning_rate": 1e-06, + "loss": 1.0602, + "mean_token_accuracy": 0.6893742084503174, + "num_tokens": 51328484.0, + "step": 2016 + }, + { + "epoch": 0.2215023061717549, + "grad_norm": 2.308377504348755, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7150552272796631, + "num_tokens": 51353805.0, + "step": 2017 + }, + { + "epoch": 0.22161212387436854, + "grad_norm": 2.5043816566467285, + "learning_rate": 1e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.6885407567024231, + "num_tokens": 51376676.0, + "step": 2018 + }, + { + "epoch": 0.22172194157698222, + "grad_norm": 2.288102149963379, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.7054587602615356, + "num_tokens": 51401552.0, + "step": 2019 + }, + { + "epoch": 0.22183175927959586, + "grad_norm": 2.208329439163208, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6953347325325012, + "num_tokens": 51425857.0, + "step": 2020 + }, + { + "epoch": 0.22194157698220954, + "grad_norm": 2.3843493461608887, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.6913347244262695, + "num_tokens": 51449209.0, + "step": 2021 + }, + { + "epoch": 0.22205139468482318, + "grad_norm": 2.479018211364746, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7015526294708252, + "num_tokens": 51471872.0, + "step": 2022 + }, + 
{ + "epoch": 0.22216121238743686, + "grad_norm": 2.3297369480133057, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7169144749641418, + "num_tokens": 51496878.0, + "step": 2023 + }, + { + "epoch": 0.2222710300900505, + "grad_norm": 2.5577337741851807, + "learning_rate": 1e-06, + "loss": 1.0732, + "mean_token_accuracy": 0.6846556663513184, + "num_tokens": 51518935.0, + "step": 2024 + }, + { + "epoch": 0.22238084779266418, + "grad_norm": 2.6625211238861084, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.6907480955123901, + "num_tokens": 51543687.0, + "step": 2025 + }, + { + "epoch": 0.22249066549527785, + "grad_norm": 2.2362287044525146, + "learning_rate": 1e-06, + "loss": 1.0916, + "mean_token_accuracy": 0.6833585500717163, + "num_tokens": 51570070.0, + "step": 2026 + }, + { + "epoch": 0.2226004831978915, + "grad_norm": 2.4351041316986084, + "learning_rate": 1e-06, + "loss": 1.003, + "mean_token_accuracy": 0.7027872204780579, + "num_tokens": 51593097.0, + "step": 2027 + }, + { + "epoch": 0.22271030090050517, + "grad_norm": 2.390265703201294, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6894906759262085, + "num_tokens": 51618039.0, + "step": 2028 + }, + { + "epoch": 0.22282011860311882, + "grad_norm": 2.402195930480957, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7085619568824768, + "num_tokens": 51640209.0, + "step": 2029 + }, + { + "epoch": 0.2229299363057325, + "grad_norm": 2.736246109008789, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7217655777931213, + "num_tokens": 51660355.0, + "step": 2030 + }, + { + "epoch": 0.22303975400834614, + "grad_norm": 2.2286853790283203, + "learning_rate": 1e-06, + "loss": 1.1896, + "mean_token_accuracy": 0.6564821004867554, + "num_tokens": 51690287.0, + "step": 2031 + }, + { + "epoch": 0.2231495717109598, + "grad_norm": 2.241964340209961, + "learning_rate": 1e-06, + "loss": 1.0927, + 
"mean_token_accuracy": 0.6707286238670349, + "num_tokens": 51719530.0, + "step": 2032 + }, + { + "epoch": 0.22325938941357346, + "grad_norm": 2.255629301071167, + "learning_rate": 1e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6921504735946655, + "num_tokens": 51745584.0, + "step": 2033 + }, + { + "epoch": 0.22336920711618713, + "grad_norm": 2.1702470779418945, + "learning_rate": 1e-06, + "loss": 1.0327, + "mean_token_accuracy": 0.6966137290000916, + "num_tokens": 51774066.0, + "step": 2034 + }, + { + "epoch": 0.2234790248188008, + "grad_norm": 2.0745275020599365, + "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.707094132900238, + "num_tokens": 51805271.0, + "step": 2035 + }, + { + "epoch": 0.22358884252141445, + "grad_norm": 2.1499476432800293, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7106186747550964, + "num_tokens": 51834083.0, + "step": 2036 + }, + { + "epoch": 0.22369866022402812, + "grad_norm": 2.136457681655884, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7052140235900879, + "num_tokens": 51861352.0, + "step": 2037 + }, + { + "epoch": 0.22380847792664177, + "grad_norm": 2.1245837211608887, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.6942780017852783, + "num_tokens": 51889530.0, + "step": 2038 + }, + { + "epoch": 0.22391829562925544, + "grad_norm": 2.0679235458374023, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7139743566513062, + "num_tokens": 51917608.0, + "step": 2039 + }, + { + "epoch": 0.2240281133318691, + "grad_norm": 2.3371827602386475, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7205621600151062, + "num_tokens": 51942178.0, + "step": 2040 + }, + { + "epoch": 0.22413793103448276, + "grad_norm": 2.3051369190216064, + "learning_rate": 1e-06, + "loss": 1.0727, + "mean_token_accuracy": 0.6781035661697388, + "num_tokens": 51965895.0, + "step": 2041 + }, + { + "epoch": 0.2242477487370964, + 
"grad_norm": 2.1404173374176025, + "learning_rate": 1e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.6980477571487427, + "num_tokens": 51993958.0, + "step": 2042 + }, + { + "epoch": 0.22435756643971008, + "grad_norm": 2.5653462409973145, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.731598973274231, + "num_tokens": 52015834.0, + "step": 2043 + }, + { + "epoch": 0.22446738414232376, + "grad_norm": 2.4477365016937256, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7097945809364319, + "num_tokens": 52038677.0, + "step": 2044 + }, + { + "epoch": 0.2245772018449374, + "grad_norm": 2.150621175765991, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7047168016433716, + "num_tokens": 52065624.0, + "step": 2045 + }, + { + "epoch": 0.22468701954755108, + "grad_norm": 2.4338176250457764, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7044309377670288, + "num_tokens": 52088437.0, + "step": 2046 + }, + { + "epoch": 0.22479683725016472, + "grad_norm": 2.2365264892578125, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7172405123710632, + "num_tokens": 52113121.0, + "step": 2047 + }, + { + "epoch": 0.2249066549527784, + "grad_norm": 2.305530071258545, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7154105305671692, + "num_tokens": 52136972.0, + "step": 2048 + }, + { + "epoch": 0.22501647265539204, + "grad_norm": 2.8537323474884033, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7129664421081543, + "num_tokens": 52154346.0, + "step": 2049 + }, + { + "epoch": 0.22512629035800572, + "grad_norm": 2.083455801010132, + "learning_rate": 1e-06, + "loss": 1.0873, + "mean_token_accuracy": 0.6788713932037354, + "num_tokens": 52185691.0, + "step": 2050 + }, + { + "epoch": 0.22523610806061936, + "grad_norm": 2.400874376296997, + "learning_rate": 1e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.6944245100021362, + 
"num_tokens": 52207341.0, + "step": 2051 + }, + { + "epoch": 0.22534592576323303, + "grad_norm": 2.0692780017852783, + "learning_rate": 1e-06, + "loss": 1.0638, + "mean_token_accuracy": 0.6858336925506592, + "num_tokens": 52235466.0, + "step": 2052 + }, + { + "epoch": 0.22545574346584668, + "grad_norm": 2.567138433456421, + "learning_rate": 1e-06, + "loss": 1.0453, + "mean_token_accuracy": 0.6840296983718872, + "num_tokens": 52256135.0, + "step": 2053 + }, + { + "epoch": 0.22556556116846035, + "grad_norm": 2.6417105197906494, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.7032603621482849, + "num_tokens": 52276299.0, + "step": 2054 + }, + { + "epoch": 0.22567537887107403, + "grad_norm": 1.9793771505355835, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7241644859313965, + "num_tokens": 52305455.0, + "step": 2055 + }, + { + "epoch": 0.22578519657368767, + "grad_norm": 2.2775766849517822, + "learning_rate": 1e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.6795320510864258, + "num_tokens": 52331520.0, + "step": 2056 + }, + { + "epoch": 0.22589501427630135, + "grad_norm": 2.3910293579101562, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.7002548575401306, + "num_tokens": 52352922.0, + "step": 2057 + }, + { + "epoch": 0.226004831978915, + "grad_norm": 2.53086256980896, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.6969095468521118, + "num_tokens": 52374984.0, + "step": 2058 + }, + { + "epoch": 0.22611464968152867, + "grad_norm": 2.0650553703308105, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.6941083073616028, + "num_tokens": 52405953.0, + "step": 2059 + }, + { + "epoch": 0.2262244673841423, + "grad_norm": 2.715658187866211, + "learning_rate": 1e-06, + "loss": 1.0464, + "mean_token_accuracy": 0.688171923160553, + "num_tokens": 52427111.0, + "step": 2060 + }, + { + "epoch": 0.226334285086756, + "grad_norm": 2.4424948692321777, + "learning_rate": 
1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7180077433586121, + "num_tokens": 52448972.0, + "step": 2061 + }, + { + "epoch": 0.22644410278936963, + "grad_norm": 2.147761344909668, + "learning_rate": 1e-06, + "loss": 1.0362, + "mean_token_accuracy": 0.6901862621307373, + "num_tokens": 52478928.0, + "step": 2062 + }, + { + "epoch": 0.2265539204919833, + "grad_norm": 2.276064157485962, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.6959797739982605, + "num_tokens": 52504057.0, + "step": 2063 + }, + { + "epoch": 0.22666373819459698, + "grad_norm": 2.1539387702941895, + "learning_rate": 1e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.6884894371032715, + "num_tokens": 52532242.0, + "step": 2064 + }, + { + "epoch": 0.22677355589721063, + "grad_norm": 2.3396267890930176, + "learning_rate": 1e-06, + "loss": 1.0317, + "mean_token_accuracy": 0.6885377764701843, + "num_tokens": 52557315.0, + "step": 2065 + }, + { + "epoch": 0.2268833735998243, + "grad_norm": 2.1896307468414307, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7102693319320679, + "num_tokens": 52584607.0, + "step": 2066 + }, + { + "epoch": 0.22699319130243795, + "grad_norm": 2.5675673484802246, + "learning_rate": 1e-06, + "loss": 1.0674, + "mean_token_accuracy": 0.6819000244140625, + "num_tokens": 52607953.0, + "step": 2067 + }, + { + "epoch": 0.22710300900505162, + "grad_norm": 2.4932661056518555, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.6965123414993286, + "num_tokens": 52631929.0, + "step": 2068 + }, + { + "epoch": 0.22721282670766527, + "grad_norm": 2.1289799213409424, + "learning_rate": 1e-06, + "loss": 1.0292, + "mean_token_accuracy": 0.6865724325180054, + "num_tokens": 52660061.0, + "step": 2069 + }, + { + "epoch": 0.22732264441027894, + "grad_norm": 2.3075287342071533, + "learning_rate": 1e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.6913830041885376, + "num_tokens": 52685985.0, + "step": 2070 + }, + { + 
"epoch": 0.22743246211289259, + "grad_norm": 2.4740986824035645, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.7000781297683716, + "num_tokens": 52705402.0, + "step": 2071 + }, + { + "epoch": 0.22754227981550626, + "grad_norm": 2.3028628826141357, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7022067904472351, + "num_tokens": 52729678.0, + "step": 2072 + }, + { + "epoch": 0.22765209751811993, + "grad_norm": 2.3606691360473633, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7210338115692139, + "num_tokens": 52752385.0, + "step": 2073 + }, + { + "epoch": 0.22776191522073358, + "grad_norm": 2.1955881118774414, + "learning_rate": 1e-06, + "loss": 1.0359, + "mean_token_accuracy": 0.6947596073150635, + "num_tokens": 52779194.0, + "step": 2074 + }, + { + "epoch": 0.22787173292334725, + "grad_norm": 2.1290740966796875, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7136991024017334, + "num_tokens": 52809598.0, + "step": 2075 + }, + { + "epoch": 0.2279815506259609, + "grad_norm": 2.1625165939331055, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.7022501230239868, + "num_tokens": 52836842.0, + "step": 2076 + }, + { + "epoch": 0.22809136832857457, + "grad_norm": 2.343787670135498, + "learning_rate": 1e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.6908556222915649, + "num_tokens": 52862741.0, + "step": 2077 + }, + { + "epoch": 0.22820118603118822, + "grad_norm": 2.936276435852051, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.722873330116272, + "num_tokens": 52880770.0, + "step": 2078 + }, + { + "epoch": 0.2283110037338019, + "grad_norm": 2.5654003620147705, + "learning_rate": 1e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.6860560178756714, + "num_tokens": 52901970.0, + "step": 2079 + }, + { + "epoch": 0.22842082143641554, + "grad_norm": 2.442230463027954, + "learning_rate": 1e-06, + "loss": 1.039, + "mean_token_accuracy": 
0.6850303411483765, + "num_tokens": 52924511.0, + "step": 2080 + }, + { + "epoch": 0.2285306391390292, + "grad_norm": 2.3327152729034424, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.6989922523498535, + "num_tokens": 52947925.0, + "step": 2081 + }, + { + "epoch": 0.2286404568416429, + "grad_norm": 2.00537371635437, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.69886314868927, + "num_tokens": 52980173.0, + "step": 2082 + }, + { + "epoch": 0.22875027454425653, + "grad_norm": 2.5515711307525635, + "learning_rate": 1e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.7025765180587769, + "num_tokens": 53001993.0, + "step": 2083 + }, + { + "epoch": 0.2288600922468702, + "grad_norm": 2.3935649394989014, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.6959378719329834, + "num_tokens": 53023794.0, + "step": 2084 + }, + { + "epoch": 0.22896990994948385, + "grad_norm": 2.445129871368408, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7235903739929199, + "num_tokens": 53046793.0, + "step": 2085 + }, + { + "epoch": 0.22907972765209753, + "grad_norm": 2.3919296264648438, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7051019668579102, + "num_tokens": 53067554.0, + "step": 2086 + }, + { + "epoch": 0.22918954535471117, + "grad_norm": 2.3985767364501953, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7127484083175659, + "num_tokens": 53089482.0, + "step": 2087 + }, + { + "epoch": 0.22929936305732485, + "grad_norm": 2.4337780475616455, + "learning_rate": 1e-06, + "loss": 1.0869, + "mean_token_accuracy": 0.6859668493270874, + "num_tokens": 53112532.0, + "step": 2088 + }, + { + "epoch": 0.2294091807599385, + "grad_norm": 2.5149624347686768, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.7038532495498657, + "num_tokens": 53135359.0, + "step": 2089 + }, + { + "epoch": 0.22951899846255217, + "grad_norm": 
1.9399371147155762, + "learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.6940904855728149, + "num_tokens": 53170050.0, + "step": 2090 + }, + { + "epoch": 0.2296288161651658, + "grad_norm": 2.3939225673675537, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.7116403579711914, + "num_tokens": 53193846.0, + "step": 2091 + }, + { + "epoch": 0.22973863386777948, + "grad_norm": 2.1954238414764404, + "learning_rate": 1e-06, + "loss": 1.0489, + "mean_token_accuracy": 0.678149402141571, + "num_tokens": 53222469.0, + "step": 2092 + }, + { + "epoch": 0.22984845157039316, + "grad_norm": 1.9911929368972778, + "learning_rate": 1e-06, + "loss": 1.0986, + "mean_token_accuracy": 0.6740248203277588, + "num_tokens": 53253605.0, + "step": 2093 + }, + { + "epoch": 0.2299582692730068, + "grad_norm": 2.379385471343994, + "learning_rate": 1e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7253437042236328, + "num_tokens": 53274893.0, + "step": 2094 + }, + { + "epoch": 0.23006808697562048, + "grad_norm": 2.247135639190674, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.7007882595062256, + "num_tokens": 53301220.0, + "step": 2095 + }, + { + "epoch": 0.23017790467823412, + "grad_norm": 2.752143144607544, + "learning_rate": 1e-06, + "loss": 1.0294, + "mean_token_accuracy": 0.6929007768630981, + "num_tokens": 53320400.0, + "step": 2096 + }, + { + "epoch": 0.2302877223808478, + "grad_norm": 2.090487241744995, + "learning_rate": 1e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.7360125780105591, + "num_tokens": 53348481.0, + "step": 2097 + }, + { + "epoch": 0.23039754008346144, + "grad_norm": 2.7340500354766846, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7140562534332275, + "num_tokens": 53366393.0, + "step": 2098 + }, + { + "epoch": 0.23050735778607512, + "grad_norm": 2.0252010822296143, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7024390697479248, + "num_tokens": 53395087.0, 
+ "step": 2099 + }, + { + "epoch": 0.23061717548868876, + "grad_norm": 2.1813437938690186, + "learning_rate": 1e-06, + "loss": 1.0586, + "mean_token_accuracy": 0.6940483450889587, + "num_tokens": 53425511.0, + "step": 2100 + }, + { + "epoch": 0.23072699319130244, + "grad_norm": 2.4372637271881104, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.6904784440994263, + "num_tokens": 53450667.0, + "step": 2101 + }, + { + "epoch": 0.2308368108939161, + "grad_norm": 2.1230661869049072, + "learning_rate": 1e-06, + "loss": 1.0249, + "mean_token_accuracy": 0.6949723958969116, + "num_tokens": 53479102.0, + "step": 2102 + }, + { + "epoch": 0.23094662859652976, + "grad_norm": 2.702461004257202, + "learning_rate": 1e-06, + "loss": 1.0414, + "mean_token_accuracy": 0.6927487850189209, + "num_tokens": 53499879.0, + "step": 2103 + }, + { + "epoch": 0.23105644629914343, + "grad_norm": 2.2109627723693848, + "learning_rate": 1e-06, + "loss": 1.0597, + "mean_token_accuracy": 0.6889059543609619, + "num_tokens": 53526205.0, + "step": 2104 + }, + { + "epoch": 0.23116626400175708, + "grad_norm": 2.5344974994659424, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.7004834413528442, + "num_tokens": 53546634.0, + "step": 2105 + }, + { + "epoch": 0.23127608170437075, + "grad_norm": 2.0889289379119873, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6946071982383728, + "num_tokens": 53573724.0, + "step": 2106 + }, + { + "epoch": 0.2313858994069844, + "grad_norm": 2.2327868938446045, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.6919989585876465, + "num_tokens": 53601381.0, + "step": 2107 + }, + { + "epoch": 0.23149571710959807, + "grad_norm": 2.212592124938965, + "learning_rate": 1e-06, + "loss": 1.0433, + "mean_token_accuracy": 0.6823094487190247, + "num_tokens": 53630656.0, + "step": 2108 + }, + { + "epoch": 0.23160553481221172, + "grad_norm": 2.4838130474090576, + "learning_rate": 1e-06, + "loss": 
1.0419, + "mean_token_accuracy": 0.6881842613220215, + "num_tokens": 53650947.0, + "step": 2109 + }, + { + "epoch": 0.2317153525148254, + "grad_norm": 2.27036714553833, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7106733322143555, + "num_tokens": 53674018.0, + "step": 2110 + }, + { + "epoch": 0.23182517021743906, + "grad_norm": 2.541583299636841, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.7023350596427917, + "num_tokens": 53694991.0, + "step": 2111 + }, + { + "epoch": 0.2319349879200527, + "grad_norm": 2.4240705966949463, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7159647941589355, + "num_tokens": 53719000.0, + "step": 2112 + }, + { + "epoch": 0.23204480562266638, + "grad_norm": 2.4371542930603027, + "learning_rate": 1e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7033809423446655, + "num_tokens": 53741702.0, + "step": 2113 + }, + { + "epoch": 0.23215462332528003, + "grad_norm": 2.5231504440307617, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.6966745853424072, + "num_tokens": 53764325.0, + "step": 2114 + }, + { + "epoch": 0.2322644410278937, + "grad_norm": 2.443805694580078, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.7000442147254944, + "num_tokens": 53789637.0, + "step": 2115 + }, + { + "epoch": 0.23237425873050735, + "grad_norm": 2.2342236042022705, + "learning_rate": 1e-06, + "loss": 1.0725, + "mean_token_accuracy": 0.6851662397384644, + "num_tokens": 53817006.0, + "step": 2116 + }, + { + "epoch": 0.23248407643312102, + "grad_norm": 2.097316265106201, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.709175705909729, + "num_tokens": 53845497.0, + "step": 2117 + }, + { + "epoch": 0.23259389413573467, + "grad_norm": 2.406385660171509, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6917721033096313, + "num_tokens": 53870653.0, + "step": 2118 + }, + { + "epoch": 0.23270371183834834, + 
"grad_norm": 2.3378591537475586, + "learning_rate": 1e-06, + "loss": 1.0688, + "mean_token_accuracy": 0.6826199889183044, + "num_tokens": 53897246.0, + "step": 2119 + }, + { + "epoch": 0.23281352954096202, + "grad_norm": 2.4937491416931152, + "learning_rate": 1e-06, + "loss": 1.0723, + "mean_token_accuracy": 0.6789175271987915, + "num_tokens": 53919815.0, + "step": 2120 + }, + { + "epoch": 0.23292334724357566, + "grad_norm": 2.270315408706665, + "learning_rate": 1e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.730963945388794, + "num_tokens": 53941494.0, + "step": 2121 + }, + { + "epoch": 0.23303316494618934, + "grad_norm": 2.3187484741210938, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.6999791860580444, + "num_tokens": 53965681.0, + "step": 2122 + }, + { + "epoch": 0.23314298264880298, + "grad_norm": 2.4755263328552246, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.6951647996902466, + "num_tokens": 53989865.0, + "step": 2123 + }, + { + "epoch": 0.23325280035141666, + "grad_norm": 2.537085771560669, + "learning_rate": 1e-06, + "loss": 1.0416, + "mean_token_accuracy": 0.6919451951980591, + "num_tokens": 54009559.0, + "step": 2124 + }, + { + "epoch": 0.2333626180540303, + "grad_norm": 2.3934590816497803, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.6951714754104614, + "num_tokens": 54034294.0, + "step": 2125 + }, + { + "epoch": 0.23347243575664398, + "grad_norm": 2.193981647491455, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7020395994186401, + "num_tokens": 54061350.0, + "step": 2126 + }, + { + "epoch": 0.23358225345925762, + "grad_norm": 2.482307195663452, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.6937825083732605, + "num_tokens": 54082819.0, + "step": 2127 + }, + { + "epoch": 0.2336920711618713, + "grad_norm": 2.1319949626922607, + "learning_rate": 1e-06, + "loss": 1.0356, + "mean_token_accuracy": 0.6898341178894043, + 
"num_tokens": 54112901.0, + "step": 2128 + }, + { + "epoch": 0.23380188886448494, + "grad_norm": 2.0291197299957275, + "learning_rate": 1e-06, + "loss": 1.134, + "mean_token_accuracy": 0.6618139743804932, + "num_tokens": 54147160.0, + "step": 2129 + }, + { + "epoch": 0.23391170656709862, + "grad_norm": 2.106228828430176, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7047050595283508, + "num_tokens": 54177274.0, + "step": 2130 + }, + { + "epoch": 0.2340215242697123, + "grad_norm": 2.344583511352539, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7071311473846436, + "num_tokens": 54199948.0, + "step": 2131 + }, + { + "epoch": 0.23413134197232593, + "grad_norm": 2.2718870639801025, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.6960811614990234, + "num_tokens": 54225232.0, + "step": 2132 + }, + { + "epoch": 0.2342411596749396, + "grad_norm": 2.2392759323120117, + "learning_rate": 1e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.7043716907501221, + "num_tokens": 54250978.0, + "step": 2133 + }, + { + "epoch": 0.23435097737755325, + "grad_norm": 2.2460954189300537, + "learning_rate": 1e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.6875712275505066, + "num_tokens": 54278315.0, + "step": 2134 + }, + { + "epoch": 0.23446079508016693, + "grad_norm": 2.6109108924865723, + "learning_rate": 1e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.6984003782272339, + "num_tokens": 54300917.0, + "step": 2135 + }, + { + "epoch": 0.23457061278278057, + "grad_norm": 2.856067419052124, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7062479257583618, + "num_tokens": 54316791.0, + "step": 2136 + }, + { + "epoch": 0.23468043048539425, + "grad_norm": 2.213303804397583, + "learning_rate": 1e-06, + "loss": 1.03, + "mean_token_accuracy": 0.686913251876831, + "num_tokens": 54343029.0, + "step": 2137 + }, + { + "epoch": 0.2347902481880079, + "grad_norm": 2.4744672775268555, + "learning_rate": 
1e-06, + "loss": 1.0758, + "mean_token_accuracy": 0.6792171597480774, + "num_tokens": 54366983.0, + "step": 2138 + }, + { + "epoch": 0.23490006589062157, + "grad_norm": 2.3048009872436523, + "learning_rate": 1e-06, + "loss": 1.0882, + "mean_token_accuracy": 0.6738025546073914, + "num_tokens": 54394213.0, + "step": 2139 + }, + { + "epoch": 0.23500988359323524, + "grad_norm": 2.4037888050079346, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.724210262298584, + "num_tokens": 54415102.0, + "step": 2140 + }, + { + "epoch": 0.2351197012958489, + "grad_norm": 2.073432207107544, + "learning_rate": 1e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.7000595331192017, + "num_tokens": 54445414.0, + "step": 2141 + }, + { + "epoch": 0.23522951899846256, + "grad_norm": 2.2899322509765625, + "learning_rate": 1e-06, + "loss": 1.0648, + "mean_token_accuracy": 0.6828083395957947, + "num_tokens": 54471819.0, + "step": 2142 + }, + { + "epoch": 0.2353393367010762, + "grad_norm": 1.9378196001052856, + "learning_rate": 1e-06, + "loss": 1.1003, + "mean_token_accuracy": 0.6778292655944824, + "num_tokens": 54506258.0, + "step": 2143 + }, + { + "epoch": 0.23544915440368988, + "grad_norm": 2.033268690109253, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.720031201839447, + "num_tokens": 54535991.0, + "step": 2144 + }, + { + "epoch": 0.23555897210630353, + "grad_norm": 2.3299942016601562, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.7009226083755493, + "num_tokens": 54558893.0, + "step": 2145 + }, + { + "epoch": 0.2356687898089172, + "grad_norm": 2.3948302268981934, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.701552152633667, + "num_tokens": 54583494.0, + "step": 2146 + }, + { + "epoch": 0.23577860751153085, + "grad_norm": 2.328108787536621, + "learning_rate": 1e-06, + "loss": 1.1308, + "mean_token_accuracy": 0.6663927435874939, + "num_tokens": 54610214.0, + "step": 2147 + }, + { + "epoch": 
0.23588842521414452, + "grad_norm": 1.9204150438308716, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7101823091506958, + "num_tokens": 54642126.0, + "step": 2148 + }, + { + "epoch": 0.2359982429167582, + "grad_norm": 2.497809410095215, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7202571630477905, + "num_tokens": 54661962.0, + "step": 2149 + }, + { + "epoch": 0.23610806061937184, + "grad_norm": 2.304187297821045, + "learning_rate": 1e-06, + "loss": 1.0564, + "mean_token_accuracy": 0.6950574517250061, + "num_tokens": 54686494.0, + "step": 2150 + }, + { + "epoch": 0.2362178783219855, + "grad_norm": 2.2445435523986816, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7002386450767517, + "num_tokens": 54712267.0, + "step": 2151 + }, + { + "epoch": 0.23632769602459916, + "grad_norm": 2.198967218399048, + "learning_rate": 1e-06, + "loss": 1.0974, + "mean_token_accuracy": 0.6808609962463379, + "num_tokens": 54742047.0, + "step": 2152 + }, + { + "epoch": 0.23643751372721283, + "grad_norm": 2.1880784034729004, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7184418439865112, + "num_tokens": 54768372.0, + "step": 2153 + }, + { + "epoch": 0.23654733142982648, + "grad_norm": 2.306666374206543, + "learning_rate": 1e-06, + "loss": 1.1021, + "mean_token_accuracy": 0.6743344664573669, + "num_tokens": 54794130.0, + "step": 2154 + }, + { + "epoch": 0.23665714913244015, + "grad_norm": 2.0499000549316406, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.719855785369873, + "num_tokens": 54823598.0, + "step": 2155 + }, + { + "epoch": 0.2367669668350538, + "grad_norm": 2.384493589401245, + "learning_rate": 1e-06, + "loss": 1.0873, + "mean_token_accuracy": 0.6795088052749634, + "num_tokens": 54848449.0, + "step": 2156 + }, + { + "epoch": 0.23687678453766747, + "grad_norm": 2.6304092407226562, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 
0.7231792211532593, + "num_tokens": 54867815.0, + "step": 2157 + }, + { + "epoch": 0.23698660224028115, + "grad_norm": 2.254560947418213, + "learning_rate": 1e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.6987643241882324, + "num_tokens": 54893093.0, + "step": 2158 + }, + { + "epoch": 0.2370964199428948, + "grad_norm": 2.260361433029175, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7075074315071106, + "num_tokens": 54917247.0, + "step": 2159 + }, + { + "epoch": 0.23720623764550847, + "grad_norm": 2.4418158531188965, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7034426331520081, + "num_tokens": 54941043.0, + "step": 2160 + }, + { + "epoch": 0.2373160553481221, + "grad_norm": 2.437800168991089, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7086122035980225, + "num_tokens": 54962591.0, + "step": 2161 + }, + { + "epoch": 0.2374258730507358, + "grad_norm": 2.190983295440674, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.7061799764633179, + "num_tokens": 54989448.0, + "step": 2162 + }, + { + "epoch": 0.23753569075334943, + "grad_norm": 2.484288215637207, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7046880722045898, + "num_tokens": 55011612.0, + "step": 2163 + }, + { + "epoch": 0.2376455084559631, + "grad_norm": 2.421431541442871, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.6942629814147949, + "num_tokens": 55034620.0, + "step": 2164 + }, + { + "epoch": 0.23775532615857675, + "grad_norm": 2.0681610107421875, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7103486061096191, + "num_tokens": 55061852.0, + "step": 2165 + }, + { + "epoch": 0.23786514386119043, + "grad_norm": 2.2335116863250732, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.6950339078903198, + "num_tokens": 55087612.0, + "step": 2166 + }, + { + "epoch": 0.23797496156380407, + "grad_norm": 2.2044432163238525, 
+ "learning_rate": 1e-06, + "loss": 1.0449, + "mean_token_accuracy": 0.6885547637939453, + "num_tokens": 55115676.0, + "step": 2167 + }, + { + "epoch": 0.23808477926641775, + "grad_norm": 2.341808795928955, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7093411087989807, + "num_tokens": 55140466.0, + "step": 2168 + }, + { + "epoch": 0.23819459696903142, + "grad_norm": 2.210948944091797, + "learning_rate": 1e-06, + "loss": 1.0461, + "mean_token_accuracy": 0.685498833656311, + "num_tokens": 55166225.0, + "step": 2169 + }, + { + "epoch": 0.23830441467164507, + "grad_norm": 2.308958053588867, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.710139811038971, + "num_tokens": 55191635.0, + "step": 2170 + }, + { + "epoch": 0.23841423237425874, + "grad_norm": 2.3357176780700684, + "learning_rate": 1e-06, + "loss": 1.0311, + "mean_token_accuracy": 0.6942014694213867, + "num_tokens": 55217716.0, + "step": 2171 + }, + { + "epoch": 0.23852405007687238, + "grad_norm": 2.4304559230804443, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7145993113517761, + "num_tokens": 55238971.0, + "step": 2172 + }, + { + "epoch": 0.23863386777948606, + "grad_norm": 2.131880760192871, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.692180871963501, + "num_tokens": 55266695.0, + "step": 2173 + }, + { + "epoch": 0.2387436854820997, + "grad_norm": 1.9953655004501343, + "learning_rate": 1e-06, + "loss": 1.0793, + "mean_token_accuracy": 0.6796666383743286, + "num_tokens": 55300417.0, + "step": 2174 + }, + { + "epoch": 0.23885350318471338, + "grad_norm": 2.221583843231201, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.7007870674133301, + "num_tokens": 55327465.0, + "step": 2175 + }, + { + "epoch": 0.23896332088732702, + "grad_norm": 2.0255377292633057, + "learning_rate": 1e-06, + "loss": 1.0474, + "mean_token_accuracy": 0.6826109886169434, + "num_tokens": 55359791.0, + "step": 2176 + }, + 
{ + "epoch": 0.2390731385899407, + "grad_norm": 2.883002758026123, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7026767730712891, + "num_tokens": 55375516.0, + "step": 2177 + }, + { + "epoch": 0.23918295629255437, + "grad_norm": 2.631251811981201, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7116241455078125, + "num_tokens": 55394815.0, + "step": 2178 + }, + { + "epoch": 0.23929277399516802, + "grad_norm": 2.0116631984710693, + "learning_rate": 1e-06, + "loss": 1.0589, + "mean_token_accuracy": 0.6784254312515259, + "num_tokens": 55425842.0, + "step": 2179 + }, + { + "epoch": 0.2394025916977817, + "grad_norm": 2.3454198837280273, + "learning_rate": 1e-06, + "loss": 1.0371, + "mean_token_accuracy": 0.6908254027366638, + "num_tokens": 55451260.0, + "step": 2180 + }, + { + "epoch": 0.23951240940039534, + "grad_norm": 2.3445193767547607, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.6991550922393799, + "num_tokens": 55475546.0, + "step": 2181 + }, + { + "epoch": 0.239622227103009, + "grad_norm": 2.2867395877838135, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7008069157600403, + "num_tokens": 55500146.0, + "step": 2182 + }, + { + "epoch": 0.23973204480562266, + "grad_norm": 2.307905912399292, + "learning_rate": 1e-06, + "loss": 1.048, + "mean_token_accuracy": 0.6844333410263062, + "num_tokens": 55526453.0, + "step": 2183 + }, + { + "epoch": 0.23984186250823633, + "grad_norm": 2.1205358505249023, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.6981453895568848, + "num_tokens": 55551660.0, + "step": 2184 + }, + { + "epoch": 0.23995168021084998, + "grad_norm": 2.1828572750091553, + "learning_rate": 1e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.705957293510437, + "num_tokens": 55577630.0, + "step": 2185 + }, + { + "epoch": 0.24006149791346365, + "grad_norm": 2.372354507446289, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 
0.7152517437934875, + "num_tokens": 55601351.0, + "step": 2186 + }, + { + "epoch": 0.24017131561607732, + "grad_norm": 2.425006628036499, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.6930141448974609, + "num_tokens": 55623396.0, + "step": 2187 + }, + { + "epoch": 0.24028113331869097, + "grad_norm": 1.8790740966796875, + "learning_rate": 1e-06, + "loss": 1.0213, + "mean_token_accuracy": 0.6895834803581238, + "num_tokens": 55658492.0, + "step": 2188 + }, + { + "epoch": 0.24039095102130464, + "grad_norm": 2.2617392539978027, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.7164575457572937, + "num_tokens": 55684499.0, + "step": 2189 + }, + { + "epoch": 0.2405007687239183, + "grad_norm": 2.273245096206665, + "learning_rate": 1e-06, + "loss": 1.0534, + "mean_token_accuracy": 0.6911094784736633, + "num_tokens": 55709916.0, + "step": 2190 + }, + { + "epoch": 0.24061058642653196, + "grad_norm": 2.1431524753570557, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7070021629333496, + "num_tokens": 55737714.0, + "step": 2191 + }, + { + "epoch": 0.2407204041291456, + "grad_norm": 2.4125969409942627, + "learning_rate": 1e-06, + "loss": 1.071, + "mean_token_accuracy": 0.6780654191970825, + "num_tokens": 55760603.0, + "step": 2192 + }, + { + "epoch": 0.24083022183175928, + "grad_norm": 2.271341562271118, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.712507426738739, + "num_tokens": 55784859.0, + "step": 2193 + }, + { + "epoch": 0.24094003953437293, + "grad_norm": 2.431840419769287, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.7077707648277283, + "num_tokens": 55809734.0, + "step": 2194 + }, + { + "epoch": 0.2410498572369866, + "grad_norm": 2.37831974029541, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.705618143081665, + "num_tokens": 55832722.0, + "step": 2195 + }, + { + "epoch": 0.24115967493960028, + "grad_norm": 2.1919827461242676, + 
"learning_rate": 1e-06, + "loss": 1.077, + "mean_token_accuracy": 0.6788393259048462, + "num_tokens": 55861492.0, + "step": 2196 + }, + { + "epoch": 0.24126949264221392, + "grad_norm": 2.037166118621826, + "learning_rate": 1e-06, + "loss": 1.0763, + "mean_token_accuracy": 0.6961545944213867, + "num_tokens": 55892491.0, + "step": 2197 + }, + { + "epoch": 0.2413793103448276, + "grad_norm": 2.1785354614257812, + "learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.6892328858375549, + "num_tokens": 55919660.0, + "step": 2198 + }, + { + "epoch": 0.24148912804744124, + "grad_norm": 2.2322745323181152, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.7017158269882202, + "num_tokens": 55944859.0, + "step": 2199 + }, + { + "epoch": 0.24159894575005492, + "grad_norm": 2.578834056854248, + "learning_rate": 1e-06, + "loss": 1.0302, + "mean_token_accuracy": 0.6953403949737549, + "num_tokens": 55967638.0, + "step": 2200 + }, + { + "epoch": 0.24170876345266856, + "grad_norm": 2.313112258911133, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7251899242401123, + "num_tokens": 55990180.0, + "step": 2201 + }, + { + "epoch": 0.24181858115528224, + "grad_norm": 2.0507524013519287, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6842305064201355, + "num_tokens": 56019332.0, + "step": 2202 + }, + { + "epoch": 0.24192839885789588, + "grad_norm": 2.081040382385254, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6846628785133362, + "num_tokens": 56047783.0, + "step": 2203 + }, + { + "epoch": 0.24203821656050956, + "grad_norm": 2.105503559112549, + "learning_rate": 1e-06, + "loss": 1.062, + "mean_token_accuracy": 0.6773991584777832, + "num_tokens": 56077140.0, + "step": 2204 + }, + { + "epoch": 0.2421480342631232, + "grad_norm": 2.2909181118011475, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.6936750411987305, + "num_tokens": 56102724.0, + "step": 2205 + }, + 
{ + "epoch": 0.24225785196573688, + "grad_norm": 2.6491475105285645, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.6961191296577454, + "num_tokens": 56123489.0, + "step": 2206 + }, + { + "epoch": 0.24236766966835055, + "grad_norm": 2.2766942977905273, + "learning_rate": 1e-06, + "loss": 1.0683, + "mean_token_accuracy": 0.6761793494224548, + "num_tokens": 56149333.0, + "step": 2207 + }, + { + "epoch": 0.2424774873709642, + "grad_norm": 2.1186513900756836, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7155776619911194, + "num_tokens": 56178059.0, + "step": 2208 + }, + { + "epoch": 0.24258730507357787, + "grad_norm": 2.053400993347168, + "learning_rate": 1e-06, + "loss": 1.087, + "mean_token_accuracy": 0.6800116300582886, + "num_tokens": 56210607.0, + "step": 2209 + }, + { + "epoch": 0.24269712277619152, + "grad_norm": 2.4926037788391113, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.6931759119033813, + "num_tokens": 56232609.0, + "step": 2210 + }, + { + "epoch": 0.2428069404788052, + "grad_norm": 2.2100272178649902, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.7059435844421387, + "num_tokens": 56258827.0, + "step": 2211 + }, + { + "epoch": 0.24291675818141883, + "grad_norm": 2.2188720703125, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.707785964012146, + "num_tokens": 56284687.0, + "step": 2212 + }, + { + "epoch": 0.2430265758840325, + "grad_norm": 2.2776362895965576, + "learning_rate": 1e-06, + "loss": 1.0478, + "mean_token_accuracy": 0.6875290870666504, + "num_tokens": 56309504.0, + "step": 2213 + }, + { + "epoch": 0.24313639358664615, + "grad_norm": 2.0864579677581787, + "learning_rate": 1e-06, + "loss": 1.1057, + "mean_token_accuracy": 0.6693441271781921, + "num_tokens": 56339527.0, + "step": 2214 + }, + { + "epoch": 0.24324621128925983, + "grad_norm": 2.105501651763916, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 
0.7028895616531372, + "num_tokens": 56367953.0, + "step": 2215 + }, + { + "epoch": 0.2433560289918735, + "grad_norm": 2.1265599727630615, + "learning_rate": 1e-06, + "loss": 1.1053, + "mean_token_accuracy": 0.6823322176933289, + "num_tokens": 56398014.0, + "step": 2216 + }, + { + "epoch": 0.24346584669448715, + "grad_norm": 2.2051823139190674, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7082409858703613, + "num_tokens": 56425001.0, + "step": 2217 + }, + { + "epoch": 0.24357566439710082, + "grad_norm": 2.3522517681121826, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.6935387849807739, + "num_tokens": 56449392.0, + "step": 2218 + }, + { + "epoch": 0.24368548209971447, + "grad_norm": 2.544327974319458, + "learning_rate": 1e-06, + "loss": 1.0696, + "mean_token_accuracy": 0.6762909889221191, + "num_tokens": 56471473.0, + "step": 2219 + }, + { + "epoch": 0.24379529980232814, + "grad_norm": 2.113163471221924, + "learning_rate": 1e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6902037858963013, + "num_tokens": 56501971.0, + "step": 2220 + }, + { + "epoch": 0.2439051175049418, + "grad_norm": 2.4087295532226562, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7156819701194763, + "num_tokens": 56524548.0, + "step": 2221 + }, + { + "epoch": 0.24401493520755546, + "grad_norm": 2.034850597381592, + "learning_rate": 1e-06, + "loss": 1.0409, + "mean_token_accuracy": 0.685451865196228, + "num_tokens": 56554539.0, + "step": 2222 + }, + { + "epoch": 0.2441247529101691, + "grad_norm": 2.322009563446045, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7141666412353516, + "num_tokens": 56576813.0, + "step": 2223 + }, + { + "epoch": 0.24423457061278278, + "grad_norm": 2.0131490230560303, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.6989002823829651, + "num_tokens": 56606045.0, + "step": 2224 + }, + { + "epoch": 0.24434438831539645, + "grad_norm": 
2.0563595294952393, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.6986249685287476, + "num_tokens": 56635428.0, + "step": 2225 + }, + { + "epoch": 0.2444542060180101, + "grad_norm": 2.437542676925659, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7161372900009155, + "num_tokens": 56658658.0, + "step": 2226 + }, + { + "epoch": 0.24456402372062377, + "grad_norm": 2.364462375640869, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7041228413581848, + "num_tokens": 56681358.0, + "step": 2227 + }, + { + "epoch": 0.24467384142323742, + "grad_norm": 2.404661178588867, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7177817225456238, + "num_tokens": 56702268.0, + "step": 2228 + }, + { + "epoch": 0.2447836591258511, + "grad_norm": 2.228306770324707, + "learning_rate": 1e-06, + "loss": 1.0513, + "mean_token_accuracy": 0.6826875805854797, + "num_tokens": 56731217.0, + "step": 2229 + }, + { + "epoch": 0.24489347682846474, + "grad_norm": 2.280898094177246, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.6972998976707458, + "num_tokens": 56754951.0, + "step": 2230 + }, + { + "epoch": 0.2450032945310784, + "grad_norm": 2.0688111782073975, + "learning_rate": 1e-06, + "loss": 1.0869, + "mean_token_accuracy": 0.6843121647834778, + "num_tokens": 56786047.0, + "step": 2231 + }, + { + "epoch": 0.24511311223369206, + "grad_norm": 2.4347968101501465, + "learning_rate": 1e-06, + "loss": 1.0444, + "mean_token_accuracy": 0.685230553150177, + "num_tokens": 56809427.0, + "step": 2232 + }, + { + "epoch": 0.24522292993630573, + "grad_norm": 2.4113192558288574, + "learning_rate": 1e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.6875576972961426, + "num_tokens": 56833494.0, + "step": 2233 + }, + { + "epoch": 0.2453327476389194, + "grad_norm": 2.5635910034179688, + "learning_rate": 1e-06, + "loss": 1.0504, + "mean_token_accuracy": 0.6823306083679199, + "num_tokens": 56855262.0, 
+ "step": 2234 + }, + { + "epoch": 0.24544256534153305, + "grad_norm": 2.6066787242889404, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7111574411392212, + "num_tokens": 56874483.0, + "step": 2235 + }, + { + "epoch": 0.24555238304414673, + "grad_norm": 2.279331684112549, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7113710045814514, + "num_tokens": 56898153.0, + "step": 2236 + }, + { + "epoch": 0.24566220074676037, + "grad_norm": 2.163665771484375, + "learning_rate": 1e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.6956338882446289, + "num_tokens": 56925805.0, + "step": 2237 + }, + { + "epoch": 0.24577201844937405, + "grad_norm": 2.2462058067321777, + "learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.6866099238395691, + "num_tokens": 56950738.0, + "step": 2238 + }, + { + "epoch": 0.2458818361519877, + "grad_norm": 2.19545578956604, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.6968269348144531, + "num_tokens": 56978969.0, + "step": 2239 + }, + { + "epoch": 0.24599165385460137, + "grad_norm": 2.2180566787719727, + "learning_rate": 1e-06, + "loss": 1.0495, + "mean_token_accuracy": 0.688217282295227, + "num_tokens": 57008252.0, + "step": 2240 + }, + { + "epoch": 0.246101471557215, + "grad_norm": 2.356452703475952, + "learning_rate": 1e-06, + "loss": 1.0418, + "mean_token_accuracy": 0.6869644522666931, + "num_tokens": 57031510.0, + "step": 2241 + }, + { + "epoch": 0.2462112892598287, + "grad_norm": 2.4618959426879883, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7168189287185669, + "num_tokens": 57050613.0, + "step": 2242 + }, + { + "epoch": 0.24632110696244233, + "grad_norm": 2.2427480220794678, + "learning_rate": 1e-06, + "loss": 1.0904, + "mean_token_accuracy": 0.6739645004272461, + "num_tokens": 57077006.0, + "step": 2243 + }, + { + "epoch": 0.246430924665056, + "grad_norm": 2.4501802921295166, + "learning_rate": 1e-06, + "loss": 1.0002, + 
"mean_token_accuracy": 0.7038519978523254, + "num_tokens": 57102240.0, + "step": 2244 + }, + { + "epoch": 0.24654074236766968, + "grad_norm": 2.3722922801971436, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.7116146683692932, + "num_tokens": 57126176.0, + "step": 2245 + }, + { + "epoch": 0.24665056007028333, + "grad_norm": 2.27860689163208, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7220499515533447, + "num_tokens": 57150496.0, + "step": 2246 + }, + { + "epoch": 0.246760377772897, + "grad_norm": 1.9271937608718872, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7201933860778809, + "num_tokens": 57182884.0, + "step": 2247 + }, + { + "epoch": 0.24687019547551065, + "grad_norm": 2.686306953430176, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7202092409133911, + "num_tokens": 57201784.0, + "step": 2248 + }, + { + "epoch": 0.24698001317812432, + "grad_norm": 2.329124927520752, + "learning_rate": 1e-06, + "loss": 1.0388, + "mean_token_accuracy": 0.6945287585258484, + "num_tokens": 57226193.0, + "step": 2249 + }, + { + "epoch": 0.24708983088073797, + "grad_norm": 2.0356345176696777, + "learning_rate": 1e-06, + "loss": 1.0665, + "mean_token_accuracy": 0.676153302192688, + "num_tokens": 57256558.0, + "step": 2250 + }, + { + "epoch": 0.24719964858335164, + "grad_norm": 2.2250003814697266, + "learning_rate": 1e-06, + "loss": 1.0999, + "mean_token_accuracy": 0.6776283383369446, + "num_tokens": 57286279.0, + "step": 2251 + }, + { + "epoch": 0.24730946628596528, + "grad_norm": 2.232221841812134, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7194472551345825, + "num_tokens": 57312357.0, + "step": 2252 + }, + { + "epoch": 0.24741928398857896, + "grad_norm": 2.320077896118164, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7175554037094116, + "num_tokens": 57336751.0, + "step": 2253 + }, + { + "epoch": 0.24752910169119263, + 
"grad_norm": 2.29146671295166, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.708365261554718, + "num_tokens": 57360520.0, + "step": 2254 + }, + { + "epoch": 0.24763891939380628, + "grad_norm": 2.0822227001190186, + "learning_rate": 1e-06, + "loss": 1.01, + "mean_token_accuracy": 0.7020338773727417, + "num_tokens": 57387989.0, + "step": 2255 + }, + { + "epoch": 0.24774873709641995, + "grad_norm": 2.1887896060943604, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.6955956220626831, + "num_tokens": 57414185.0, + "step": 2256 + }, + { + "epoch": 0.2478585547990336, + "grad_norm": 1.925149917602539, + "learning_rate": 1e-06, + "loss": 1.0573, + "mean_token_accuracy": 0.6830187439918518, + "num_tokens": 57450172.0, + "step": 2257 + }, + { + "epoch": 0.24796837250164727, + "grad_norm": 2.1500158309936523, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.6954872608184814, + "num_tokens": 57476122.0, + "step": 2258 + }, + { + "epoch": 0.24807819020426092, + "grad_norm": 2.247972011566162, + "learning_rate": 1e-06, + "loss": 1.0721, + "mean_token_accuracy": 0.6737985610961914, + "num_tokens": 57502340.0, + "step": 2259 + }, + { + "epoch": 0.2481880079068746, + "grad_norm": 2.121786117553711, + "learning_rate": 1e-06, + "loss": 1.0531, + "mean_token_accuracy": 0.6946893930435181, + "num_tokens": 57529748.0, + "step": 2260 + }, + { + "epoch": 0.24829782560948824, + "grad_norm": 2.2493135929107666, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7105108499526978, + "num_tokens": 57554208.0, + "step": 2261 + }, + { + "epoch": 0.2484076433121019, + "grad_norm": 2.3062710762023926, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.6998770236968994, + "num_tokens": 57578687.0, + "step": 2262 + }, + { + "epoch": 0.24851746101471558, + "grad_norm": 2.30853271484375, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7108296155929565, + "num_tokens": 
57601802.0, + "step": 2263 + }, + { + "epoch": 0.24862727871732923, + "grad_norm": 2.0910604000091553, + "learning_rate": 1e-06, + "loss": 1.0582, + "mean_token_accuracy": 0.681854248046875, + "num_tokens": 57633615.0, + "step": 2264 + }, + { + "epoch": 0.2487370964199429, + "grad_norm": 2.3927953243255615, + "learning_rate": 1e-06, + "loss": 1.0754, + "mean_token_accuracy": 0.6889097690582275, + "num_tokens": 57658194.0, + "step": 2265 + }, + { + "epoch": 0.24884691412255655, + "grad_norm": 2.5363805294036865, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.6968972086906433, + "num_tokens": 57678457.0, + "step": 2266 + }, + { + "epoch": 0.24895673182517022, + "grad_norm": 2.490929365158081, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.6954452991485596, + "num_tokens": 57700269.0, + "step": 2267 + }, + { + "epoch": 0.24906654952778387, + "grad_norm": 2.212941884994507, + "learning_rate": 1e-06, + "loss": 1.1088, + "mean_token_accuracy": 0.6759142875671387, + "num_tokens": 57727924.0, + "step": 2268 + }, + { + "epoch": 0.24917636723039754, + "grad_norm": 2.216468334197998, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6998683214187622, + "num_tokens": 57752559.0, + "step": 2269 + }, + { + "epoch": 0.2492861849330112, + "grad_norm": 2.400515079498291, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.709591269493103, + "num_tokens": 57773892.0, + "step": 2270 + }, + { + "epoch": 0.24939600263562486, + "grad_norm": 2.086674928665161, + "learning_rate": 1e-06, + "loss": 1.1065, + "mean_token_accuracy": 0.6734647750854492, + "num_tokens": 57805137.0, + "step": 2271 + }, + { + "epoch": 0.24950582033823854, + "grad_norm": 2.128694534301758, + "learning_rate": 1e-06, + "loss": 1.0511, + "mean_token_accuracy": 0.6938154101371765, + "num_tokens": 57832861.0, + "step": 2272 + }, + { + "epoch": 0.24961563804085218, + "grad_norm": 2.1334335803985596, + "learning_rate": 1e-06, + 
"loss": 0.9095, + "mean_token_accuracy": 0.7184424996376038, + "num_tokens": 57859928.0, + "step": 2273 + }, + { + "epoch": 0.24972545574346586, + "grad_norm": 2.262615442276001, + "learning_rate": 1e-06, + "loss": 1.1191, + "mean_token_accuracy": 0.6847491264343262, + "num_tokens": 57885801.0, + "step": 2274 + }, + { + "epoch": 0.2498352734460795, + "grad_norm": 2.4192276000976562, + "learning_rate": 1e-06, + "loss": 1.0681, + "mean_token_accuracy": 0.6774304509162903, + "num_tokens": 57909300.0, + "step": 2275 + }, + { + "epoch": 0.24994509114869318, + "grad_norm": 2.3743910789489746, + "learning_rate": 1e-06, + "loss": 1.0678, + "mean_token_accuracy": 0.6827106475830078, + "num_tokens": 57933885.0, + "step": 2276 + }, + { + "epoch": 0.2500549088513068, + "grad_norm": 2.2147929668426514, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.693699061870575, + "num_tokens": 57959863.0, + "step": 2277 + }, + { + "epoch": 0.25016472655392047, + "grad_norm": 2.117208242416382, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7252624034881592, + "num_tokens": 57985923.0, + "step": 2278 + }, + { + "epoch": 0.25027454425653417, + "grad_norm": 2.3874189853668213, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7140184640884399, + "num_tokens": 58007704.0, + "step": 2279 + }, + { + "epoch": 0.2503843619591478, + "grad_norm": 2.240929126739502, + "learning_rate": 1e-06, + "loss": 1.034, + "mean_token_accuracy": 0.692339301109314, + "num_tokens": 58033498.0, + "step": 2280 + }, + { + "epoch": 0.25049417966176146, + "grad_norm": 2.278768301010132, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.6985231637954712, + "num_tokens": 58058536.0, + "step": 2281 + }, + { + "epoch": 0.25060399736437516, + "grad_norm": 2.381763458251953, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7092562317848206, + "num_tokens": 58081416.0, + "step": 2282 + }, + { + "epoch": 
0.2507138150669888, + "grad_norm": 2.450643539428711, + "learning_rate": 1e-06, + "loss": 1.1135, + "mean_token_accuracy": 0.6678738594055176, + "num_tokens": 58104927.0, + "step": 2283 + }, + { + "epoch": 0.25082363276960246, + "grad_norm": 2.1662724018096924, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.705115795135498, + "num_tokens": 58131672.0, + "step": 2284 + }, + { + "epoch": 0.2509334504722161, + "grad_norm": 2.162834882736206, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7180413007736206, + "num_tokens": 58158040.0, + "step": 2285 + }, + { + "epoch": 0.2510432681748298, + "grad_norm": 2.2442386150360107, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7053059339523315, + "num_tokens": 58181301.0, + "step": 2286 + }, + { + "epoch": 0.25115308587744345, + "grad_norm": 2.1358461380004883, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.7086224555969238, + "num_tokens": 58206854.0, + "step": 2287 + }, + { + "epoch": 0.2512629035800571, + "grad_norm": 2.200777053833008, + "learning_rate": 1e-06, + "loss": 1.0384, + "mean_token_accuracy": 0.6885011196136475, + "num_tokens": 58235100.0, + "step": 2288 + }, + { + "epoch": 0.25137272128267074, + "grad_norm": 1.9979554414749146, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.6944236159324646, + "num_tokens": 58266842.0, + "step": 2289 + }, + { + "epoch": 0.25148253898528444, + "grad_norm": 2.2263991832733154, + "learning_rate": 1e-06, + "loss": 1.0986, + "mean_token_accuracy": 0.6756307482719421, + "num_tokens": 58293877.0, + "step": 2290 + }, + { + "epoch": 0.2515923566878981, + "grad_norm": 2.2311887741088867, + "learning_rate": 1e-06, + "loss": 1.0793, + "mean_token_accuracy": 0.6838686466217041, + "num_tokens": 58320113.0, + "step": 2291 + }, + { + "epoch": 0.25170217439051173, + "grad_norm": 2.338329315185547, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 
0.7042336463928223, + "num_tokens": 58342852.0, + "step": 2292 + }, + { + "epoch": 0.25181199209312544, + "grad_norm": 2.4495089054107666, + "learning_rate": 1e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.7169333100318909, + "num_tokens": 58362511.0, + "step": 2293 + }, + { + "epoch": 0.2519218097957391, + "grad_norm": 2.4915101528167725, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.7025662660598755, + "num_tokens": 58384096.0, + "step": 2294 + }, + { + "epoch": 0.25203162749835273, + "grad_norm": 2.6110215187072754, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7095805406570435, + "num_tokens": 58404629.0, + "step": 2295 + }, + { + "epoch": 0.2521414452009664, + "grad_norm": 2.207152843475342, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.6981843709945679, + "num_tokens": 58431544.0, + "step": 2296 + }, + { + "epoch": 0.2522512629035801, + "grad_norm": 2.3555071353912354, + "learning_rate": 1e-06, + "loss": 1.1159, + "mean_token_accuracy": 0.6687847375869751, + "num_tokens": 58460179.0, + "step": 2297 + }, + { + "epoch": 0.2523610806061937, + "grad_norm": 2.047738790512085, + "learning_rate": 1e-06, + "loss": 1.1267, + "mean_token_accuracy": 0.6625247001647949, + "num_tokens": 58491518.0, + "step": 2298 + }, + { + "epoch": 0.25247089830880737, + "grad_norm": 2.1345090866088867, + "learning_rate": 1e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.6876048445701599, + "num_tokens": 58519745.0, + "step": 2299 + }, + { + "epoch": 0.252580716011421, + "grad_norm": 2.1624441146850586, + "learning_rate": 1e-06, + "loss": 1.0407, + "mean_token_accuracy": 0.6889692544937134, + "num_tokens": 58548882.0, + "step": 2300 + }, + { + "epoch": 0.2526905337140347, + "grad_norm": 2.1665737628936768, + "learning_rate": 1e-06, + "loss": 1.0591, + "mean_token_accuracy": 0.6890525221824646, + "num_tokens": 58575946.0, + "step": 2301 + }, + { + "epoch": 0.25280035141664836, + "grad_norm": 
2.068101644515991, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7093935012817383, + "num_tokens": 58603395.0, + "step": 2302 + }, + { + "epoch": 0.252910169119262, + "grad_norm": 2.2495017051696777, + "learning_rate": 1e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.7308766841888428, + "num_tokens": 58628012.0, + "step": 2303 + }, + { + "epoch": 0.2530199868218757, + "grad_norm": 1.991881012916565, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7021965980529785, + "num_tokens": 58658426.0, + "step": 2304 + }, + { + "epoch": 0.25312980452448935, + "grad_norm": 2.612122058868408, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.7002701163291931, + "num_tokens": 58683656.0, + "step": 2305 + }, + { + "epoch": 0.253239622227103, + "grad_norm": 2.2138285636901855, + "learning_rate": 1e-06, + "loss": 1.0646, + "mean_token_accuracy": 0.6776527166366577, + "num_tokens": 58711491.0, + "step": 2306 + }, + { + "epoch": 0.25334943992971665, + "grad_norm": 2.3952696323394775, + "learning_rate": 1e-06, + "loss": 1.0699, + "mean_token_accuracy": 0.6906834840774536, + "num_tokens": 58734039.0, + "step": 2307 + }, + { + "epoch": 0.25345925763233035, + "grad_norm": 2.21943736076355, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.6966607570648193, + "num_tokens": 58761988.0, + "step": 2308 + }, + { + "epoch": 0.253569075334944, + "grad_norm": 2.1469717025756836, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.6993134021759033, + "num_tokens": 58789913.0, + "step": 2309 + }, + { + "epoch": 0.25367889303755764, + "grad_norm": 1.9856021404266357, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7086913585662842, + "num_tokens": 58819121.0, + "step": 2310 + }, + { + "epoch": 0.25378871074017134, + "grad_norm": 2.2475547790527344, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.6995275020599365, + "num_tokens": 58845386.0, + 
"step": 2311 + }, + { + "epoch": 0.253898528442785, + "grad_norm": 2.1241073608398438, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7098251581192017, + "num_tokens": 58871503.0, + "step": 2312 + }, + { + "epoch": 0.25400834614539863, + "grad_norm": 2.0060477256774902, + "learning_rate": 1e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.6968752145767212, + "num_tokens": 58902261.0, + "step": 2313 + }, + { + "epoch": 0.2541181638480123, + "grad_norm": 2.39290714263916, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7135666012763977, + "num_tokens": 58924703.0, + "step": 2314 + }, + { + "epoch": 0.254227981550626, + "grad_norm": 2.19722580909729, + "learning_rate": 1e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.6895421743392944, + "num_tokens": 58950660.0, + "step": 2315 + }, + { + "epoch": 0.2543377992532396, + "grad_norm": 2.342994451522827, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7015764713287354, + "num_tokens": 58975171.0, + "step": 2316 + }, + { + "epoch": 0.2544476169558533, + "grad_norm": 2.4744365215301514, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.6903446912765503, + "num_tokens": 58997115.0, + "step": 2317 + }, + { + "epoch": 0.2545574346584669, + "grad_norm": 2.268975019454956, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.6977630257606506, + "num_tokens": 59021532.0, + "step": 2318 + }, + { + "epoch": 0.2546672523610806, + "grad_norm": 2.5052735805511475, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7253042459487915, + "num_tokens": 59042168.0, + "step": 2319 + }, + { + "epoch": 0.25477707006369427, + "grad_norm": 2.168644428253174, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.7005698680877686, + "num_tokens": 59067329.0, + "step": 2320 + }, + { + "epoch": 0.2548868877663079, + "grad_norm": 2.177804470062256, + "learning_rate": 1e-06, + "loss": 0.9971, + 
"mean_token_accuracy": 0.6992002725601196, + "num_tokens": 59093591.0, + "step": 2321 + }, + { + "epoch": 0.2549967054689216, + "grad_norm": 2.4761931896209717, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.6987971663475037, + "num_tokens": 59114438.0, + "step": 2322 + }, + { + "epoch": 0.25510652317153526, + "grad_norm": 2.2849156856536865, + "learning_rate": 1e-06, + "loss": 1.0759, + "mean_token_accuracy": 0.6750848293304443, + "num_tokens": 59137911.0, + "step": 2323 + }, + { + "epoch": 0.2552163408741489, + "grad_norm": 2.2372992038726807, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7032546401023865, + "num_tokens": 59162390.0, + "step": 2324 + }, + { + "epoch": 0.25532615857676255, + "grad_norm": 2.152750015258789, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.6993477940559387, + "num_tokens": 59188050.0, + "step": 2325 + }, + { + "epoch": 0.25543597627937625, + "grad_norm": 2.4205760955810547, + "learning_rate": 1e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.7225809693336487, + "num_tokens": 59208752.0, + "step": 2326 + }, + { + "epoch": 0.2555457939819899, + "grad_norm": 2.0856857299804688, + "learning_rate": 1e-06, + "loss": 1.131, + "mean_token_accuracy": 0.6651812195777893, + "num_tokens": 59237985.0, + "step": 2327 + }, + { + "epoch": 0.25565561168460355, + "grad_norm": 2.127561092376709, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.6966229677200317, + "num_tokens": 59264812.0, + "step": 2328 + }, + { + "epoch": 0.25576542938721725, + "grad_norm": 2.16644549369812, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7252101898193359, + "num_tokens": 59289073.0, + "step": 2329 + }, + { + "epoch": 0.2558752470898309, + "grad_norm": 2.276198387145996, + "learning_rate": 1e-06, + "loss": 1.0771, + "mean_token_accuracy": 0.6759630441665649, + "num_tokens": 59315551.0, + "step": 2330 + }, + { + "epoch": 0.25598506479244454, + 
"grad_norm": 2.281297445297241, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.7076669931411743, + "num_tokens": 59339916.0, + "step": 2331 + }, + { + "epoch": 0.2560948824950582, + "grad_norm": 2.0330862998962402, + "learning_rate": 1e-06, + "loss": 1.0556, + "mean_token_accuracy": 0.6826062202453613, + "num_tokens": 59370389.0, + "step": 2332 + }, + { + "epoch": 0.2562047001976719, + "grad_norm": 2.5023446083068848, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7162626385688782, + "num_tokens": 59391737.0, + "step": 2333 + }, + { + "epoch": 0.25631451790028553, + "grad_norm": 2.3909573554992676, + "learning_rate": 1e-06, + "loss": 1.0459, + "mean_token_accuracy": 0.6929522752761841, + "num_tokens": 59415523.0, + "step": 2334 + }, + { + "epoch": 0.2564243356028992, + "grad_norm": 2.3460030555725098, + "learning_rate": 1e-06, + "loss": 1.0598, + "mean_token_accuracy": 0.6799250841140747, + "num_tokens": 59440413.0, + "step": 2335 + }, + { + "epoch": 0.2565341533055128, + "grad_norm": 2.1457550525665283, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7043236494064331, + "num_tokens": 59469896.0, + "step": 2336 + }, + { + "epoch": 0.2566439710081265, + "grad_norm": 2.471937656402588, + "learning_rate": 1e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.7130293846130371, + "num_tokens": 59491930.0, + "step": 2337 + }, + { + "epoch": 0.25675378871074017, + "grad_norm": 2.160944700241089, + "learning_rate": 1e-06, + "loss": 1.0838, + "mean_token_accuracy": 0.6789169907569885, + "num_tokens": 59521831.0, + "step": 2338 + }, + { + "epoch": 0.2568636064133538, + "grad_norm": 2.2270660400390625, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7076727151870728, + "num_tokens": 59547444.0, + "step": 2339 + }, + { + "epoch": 0.2569734241159675, + "grad_norm": 2.312293529510498, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.6884998083114624, + "num_tokens": 
59571063.0, + "step": 2340 + }, + { + "epoch": 0.25708324181858117, + "grad_norm": 2.1456356048583984, + "learning_rate": 1e-06, + "loss": 1.0965, + "mean_token_accuracy": 0.6713017225265503, + "num_tokens": 59599418.0, + "step": 2341 + }, + { + "epoch": 0.2571930595211948, + "grad_norm": 2.3099517822265625, + "learning_rate": 1e-06, + "loss": 1.0323, + "mean_token_accuracy": 0.6883401870727539, + "num_tokens": 59623557.0, + "step": 2342 + }, + { + "epoch": 0.25730287722380846, + "grad_norm": 2.2512669563293457, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7125765681266785, + "num_tokens": 59647781.0, + "step": 2343 + }, + { + "epoch": 0.25741269492642216, + "grad_norm": 2.33520245552063, + "learning_rate": 1e-06, + "loss": 1.0489, + "mean_token_accuracy": 0.68218994140625, + "num_tokens": 59671972.0, + "step": 2344 + }, + { + "epoch": 0.2575225126290358, + "grad_norm": 2.2331037521362305, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7101250886917114, + "num_tokens": 59697954.0, + "step": 2345 + }, + { + "epoch": 0.25763233033164945, + "grad_norm": 2.3663363456726074, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.686437726020813, + "num_tokens": 59720621.0, + "step": 2346 + }, + { + "epoch": 0.2577421480342631, + "grad_norm": 2.606313705444336, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7114814519882202, + "num_tokens": 59739800.0, + "step": 2347 + }, + { + "epoch": 0.2578519657368768, + "grad_norm": 2.3065104484558105, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.6895430684089661, + "num_tokens": 59764729.0, + "step": 2348 + }, + { + "epoch": 0.25796178343949044, + "grad_norm": 2.077598810195923, + "learning_rate": 1e-06, + "loss": 1.0951, + "mean_token_accuracy": 0.672967791557312, + "num_tokens": 59794524.0, + "step": 2349 + }, + { + "epoch": 0.2580716011421041, + "grad_norm": 2.3052992820739746, + "learning_rate": 1e-06, + "loss": 
1.0071, + "mean_token_accuracy": 0.6995306015014648, + "num_tokens": 59818053.0, + "step": 2350 + }, + { + "epoch": 0.2581814188447178, + "grad_norm": 2.2360196113586426, + "learning_rate": 1e-06, + "loss": 1.0452, + "mean_token_accuracy": 0.6912558674812317, + "num_tokens": 59843763.0, + "step": 2351 + }, + { + "epoch": 0.25829123654733144, + "grad_norm": 2.199857473373413, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7032891511917114, + "num_tokens": 59870835.0, + "step": 2352 + }, + { + "epoch": 0.2584010542499451, + "grad_norm": 2.1430892944335938, + "learning_rate": 1e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.7055928111076355, + "num_tokens": 59900833.0, + "step": 2353 + }, + { + "epoch": 0.25851087195255873, + "grad_norm": 2.5912609100341797, + "learning_rate": 1e-06, + "loss": 0.848, + "mean_token_accuracy": 0.7350099682807922, + "num_tokens": 59919304.0, + "step": 2354 + }, + { + "epoch": 0.25862068965517243, + "grad_norm": 2.199904441833496, + "learning_rate": 1e-06, + "loss": 1.0108, + "mean_token_accuracy": 0.6996583342552185, + "num_tokens": 59945865.0, + "step": 2355 + }, + { + "epoch": 0.2587305073577861, + "grad_norm": 2.382692575454712, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.6954026222229004, + "num_tokens": 59966901.0, + "step": 2356 + }, + { + "epoch": 0.2588403250603997, + "grad_norm": 2.568842887878418, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7105191349983215, + "num_tokens": 59987433.0, + "step": 2357 + }, + { + "epoch": 0.2589501427630134, + "grad_norm": 2.1504931449890137, + "learning_rate": 1e-06, + "loss": 1.0384, + "mean_token_accuracy": 0.6833164691925049, + "num_tokens": 60014276.0, + "step": 2358 + }, + { + "epoch": 0.25905996046562707, + "grad_norm": 2.1996798515319824, + "learning_rate": 1e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.6968355774879456, + "num_tokens": 60040816.0, + "step": 2359 + }, + { + "epoch": 0.2591697781682407, + 
"grad_norm": 2.0144412517547607, + "learning_rate": 1e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.6890722513198853, + "num_tokens": 60070512.0, + "step": 2360 + }, + { + "epoch": 0.25927959587085436, + "grad_norm": 2.5949251651763916, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7154635190963745, + "num_tokens": 60088767.0, + "step": 2361 + }, + { + "epoch": 0.25938941357346806, + "grad_norm": 2.1547138690948486, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7133059501647949, + "num_tokens": 60113484.0, + "step": 2362 + }, + { + "epoch": 0.2594992312760817, + "grad_norm": 2.3571600914001465, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7010351419448853, + "num_tokens": 60136728.0, + "step": 2363 + }, + { + "epoch": 0.25960904897869536, + "grad_norm": 2.2284293174743652, + "learning_rate": 1e-06, + "loss": 1.0377, + "mean_token_accuracy": 0.6942357420921326, + "num_tokens": 60162468.0, + "step": 2364 + }, + { + "epoch": 0.259718866681309, + "grad_norm": 2.2700769901275635, + "learning_rate": 1e-06, + "loss": 1.0821, + "mean_token_accuracy": 0.6733648777008057, + "num_tokens": 60191364.0, + "step": 2365 + }, + { + "epoch": 0.2598286843839227, + "grad_norm": 2.0843446254730225, + "learning_rate": 1e-06, + "loss": 1.0619, + "mean_token_accuracy": 0.6852229833602905, + "num_tokens": 60219824.0, + "step": 2366 + }, + { + "epoch": 0.25993850208653635, + "grad_norm": 2.316572666168213, + "learning_rate": 1e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.7010010480880737, + "num_tokens": 60243732.0, + "step": 2367 + }, + { + "epoch": 0.26004831978915, + "grad_norm": 2.332139492034912, + "learning_rate": 1e-06, + "loss": 1.0536, + "mean_token_accuracy": 0.682370662689209, + "num_tokens": 60268773.0, + "step": 2368 + }, + { + "epoch": 0.2601581374917637, + "grad_norm": 2.1960387229919434, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.6947987079620361, + "num_tokens": 
60294127.0, + "step": 2369 + }, + { + "epoch": 0.26026795519437734, + "grad_norm": 2.4962234497070312, + "learning_rate": 1e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.6912076473236084, + "num_tokens": 60316824.0, + "step": 2370 + }, + { + "epoch": 0.260377772896991, + "grad_norm": 2.6251654624938965, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7037768959999084, + "num_tokens": 60336826.0, + "step": 2371 + }, + { + "epoch": 0.26048759059960463, + "grad_norm": 2.04841947555542, + "learning_rate": 1e-06, + "loss": 1.0719, + "mean_token_accuracy": 0.6877825260162354, + "num_tokens": 60366933.0, + "step": 2372 + }, + { + "epoch": 0.26059740830221834, + "grad_norm": 2.172818660736084, + "learning_rate": 1e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7241872549057007, + "num_tokens": 60390850.0, + "step": 2373 + }, + { + "epoch": 0.260707226004832, + "grad_norm": 2.3360142707824707, + "learning_rate": 1e-06, + "loss": 1.0443, + "mean_token_accuracy": 0.6882160902023315, + "num_tokens": 60414987.0, + "step": 2374 + }, + { + "epoch": 0.26081704370744563, + "grad_norm": 2.3489487171173096, + "learning_rate": 1e-06, + "loss": 1.1138, + "mean_token_accuracy": 0.6681577563285828, + "num_tokens": 60438871.0, + "step": 2375 + }, + { + "epoch": 0.2609268614100593, + "grad_norm": 2.4430007934570312, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7138291001319885, + "num_tokens": 60459708.0, + "step": 2376 + }, + { + "epoch": 0.261036679112673, + "grad_norm": 2.232191562652588, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.6997986435890198, + "num_tokens": 60485765.0, + "step": 2377 + }, + { + "epoch": 0.2611464968152866, + "grad_norm": 2.5931296348571777, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7064725160598755, + "num_tokens": 60505466.0, + "step": 2378 + }, + { + "epoch": 0.26125631451790027, + "grad_norm": 2.243267297744751, + "learning_rate": 1e-06, + "loss": 
1.0667, + "mean_token_accuracy": 0.6817998290061951, + "num_tokens": 60531929.0, + "step": 2379 + }, + { + "epoch": 0.26136613222051397, + "grad_norm": 2.325650215148926, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.6892170906066895, + "num_tokens": 60558580.0, + "step": 2380 + }, + { + "epoch": 0.2614759499231276, + "grad_norm": 2.5437052249908447, + "learning_rate": 1e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.7061255574226379, + "num_tokens": 60579318.0, + "step": 2381 + }, + { + "epoch": 0.26158576762574126, + "grad_norm": 2.317988157272339, + "learning_rate": 1e-06, + "loss": 1.0035, + "mean_token_accuracy": 0.7032709717750549, + "num_tokens": 60606718.0, + "step": 2382 + }, + { + "epoch": 0.2616955853283549, + "grad_norm": 2.5426928997039795, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.6961389183998108, + "num_tokens": 60627371.0, + "step": 2383 + }, + { + "epoch": 0.2618054030309686, + "grad_norm": 2.0001790523529053, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.700395941734314, + "num_tokens": 60660537.0, + "step": 2384 + }, + { + "epoch": 0.26191522073358225, + "grad_norm": 2.208789348602295, + "learning_rate": 1e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.6878525614738464, + "num_tokens": 60686412.0, + "step": 2385 + }, + { + "epoch": 0.2620250384361959, + "grad_norm": 2.280169725418091, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7084869742393494, + "num_tokens": 60712471.0, + "step": 2386 + }, + { + "epoch": 0.2621348561388096, + "grad_norm": 2.591559886932373, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.70530104637146, + "num_tokens": 60732505.0, + "step": 2387 + }, + { + "epoch": 0.26224467384142325, + "grad_norm": 2.3462750911712646, + "learning_rate": 1e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.6890501976013184, + "num_tokens": 60755868.0, + "step": 2388 + }, + { + "epoch": 0.2623544915440369, + 
"grad_norm": 2.0285916328430176, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6893708109855652, + "num_tokens": 60787450.0, + "step": 2389 + }, + { + "epoch": 0.26246430924665054, + "grad_norm": 2.516775608062744, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.6937263011932373, + "num_tokens": 60809851.0, + "step": 2390 + }, + { + "epoch": 0.26257412694926424, + "grad_norm": 2.3861491680145264, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7029947638511658, + "num_tokens": 60831819.0, + "step": 2391 + }, + { + "epoch": 0.2626839446518779, + "grad_norm": 2.1343531608581543, + "learning_rate": 1e-06, + "loss": 1.0372, + "mean_token_accuracy": 0.685755729675293, + "num_tokens": 60860800.0, + "step": 2392 + }, + { + "epoch": 0.26279376235449153, + "grad_norm": 2.3226122856140137, + "learning_rate": 1e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.6863731145858765, + "num_tokens": 60884866.0, + "step": 2393 + }, + { + "epoch": 0.2629035800571052, + "grad_norm": 2.2029662132263184, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.6998410820960999, + "num_tokens": 60911693.0, + "step": 2394 + }, + { + "epoch": 0.2630133977597189, + "grad_norm": 2.282538414001465, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7120206952095032, + "num_tokens": 60934861.0, + "step": 2395 + }, + { + "epoch": 0.2631232154623325, + "grad_norm": 1.9005523920059204, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.6926788091659546, + "num_tokens": 60967757.0, + "step": 2396 + }, + { + "epoch": 0.2632330331649462, + "grad_norm": 2.1229817867279053, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.6887296438217163, + "num_tokens": 60995976.0, + "step": 2397 + }, + { + "epoch": 0.2633428508675599, + "grad_norm": 2.1617705821990967, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.7053908109664917, + "num_tokens": 
61022838.0, + "step": 2398 + }, + { + "epoch": 0.2634526685701735, + "grad_norm": 2.191964864730835, + "learning_rate": 1e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.6925303936004639, + "num_tokens": 61050864.0, + "step": 2399 + }, + { + "epoch": 0.26356248627278717, + "grad_norm": 2.051718235015869, + "learning_rate": 1e-06, + "loss": 1.0901, + "mean_token_accuracy": 0.6733659505844116, + "num_tokens": 61082684.0, + "step": 2400 + }, + { + "epoch": 0.2636723039754008, + "grad_norm": 2.0313775539398193, + "learning_rate": 1e-06, + "loss": 1.0681, + "mean_token_accuracy": 0.6767472624778748, + "num_tokens": 61112993.0, + "step": 2401 + }, + { + "epoch": 0.2637821216780145, + "grad_norm": 2.111274242401123, + "learning_rate": 1e-06, + "loss": 1.04, + "mean_token_accuracy": 0.6929904222488403, + "num_tokens": 61140427.0, + "step": 2402 + }, + { + "epoch": 0.26389193938062816, + "grad_norm": 2.0987541675567627, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7161253690719604, + "num_tokens": 61168034.0, + "step": 2403 + }, + { + "epoch": 0.2640017570832418, + "grad_norm": 2.7007477283477783, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7112494111061096, + "num_tokens": 61186042.0, + "step": 2404 + }, + { + "epoch": 0.2641115747858555, + "grad_norm": 2.3412179946899414, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7071148753166199, + "num_tokens": 61210272.0, + "step": 2405 + }, + { + "epoch": 0.26422139248846915, + "grad_norm": 2.3181936740875244, + "learning_rate": 1e-06, + "loss": 1.0523, + "mean_token_accuracy": 0.6927189826965332, + "num_tokens": 61235286.0, + "step": 2406 + }, + { + "epoch": 0.2643312101910828, + "grad_norm": 2.637655735015869, + "learning_rate": 1e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.7012860774993896, + "num_tokens": 61257252.0, + "step": 2407 + }, + { + "epoch": 0.26444102789369645, + "grad_norm": 2.2378125190734863, + "learning_rate": 1e-06, + "loss": 
0.9144, + "mean_token_accuracy": 0.7229098081588745, + "num_tokens": 61281766.0, + "step": 2408 + }, + { + "epoch": 0.26455084559631015, + "grad_norm": 2.2337753772735596, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.6897692680358887, + "num_tokens": 61307816.0, + "step": 2409 + }, + { + "epoch": 0.2646606632989238, + "grad_norm": 2.143893003463745, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7115519046783447, + "num_tokens": 61336148.0, + "step": 2410 + }, + { + "epoch": 0.26477048100153744, + "grad_norm": 2.337064266204834, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6931368112564087, + "num_tokens": 61361224.0, + "step": 2411 + }, + { + "epoch": 0.2648802987041511, + "grad_norm": 2.711256265640259, + "learning_rate": 1e-06, + "loss": 1.0748, + "mean_token_accuracy": 0.678679883480072, + "num_tokens": 61381899.0, + "step": 2412 + }, + { + "epoch": 0.2649901164067648, + "grad_norm": 2.3898417949676514, + "learning_rate": 1e-06, + "loss": 1.0827, + "mean_token_accuracy": 0.6743096113204956, + "num_tokens": 61408914.0, + "step": 2413 + }, + { + "epoch": 0.26509993410937843, + "grad_norm": 2.491403579711914, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.695758044719696, + "num_tokens": 61431092.0, + "step": 2414 + }, + { + "epoch": 0.2652097518119921, + "grad_norm": 2.6460320949554443, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7040653228759766, + "num_tokens": 61450132.0, + "step": 2415 + }, + { + "epoch": 0.2653195695146058, + "grad_norm": 2.402162790298462, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.7068816423416138, + "num_tokens": 61474308.0, + "step": 2416 + }, + { + "epoch": 0.2654293872172194, + "grad_norm": 2.125631093978882, + "learning_rate": 1e-06, + "loss": 1.0681, + "mean_token_accuracy": 0.6757840514183044, + "num_tokens": 61503688.0, + "step": 2417 + }, + { + "epoch": 0.26553920491983307, + 
"grad_norm": 1.912136435508728, + "learning_rate": 1e-06, + "loss": 1.043, + "mean_token_accuracy": 0.693817138671875, + "num_tokens": 61538580.0, + "step": 2418 + }, + { + "epoch": 0.2656490226224467, + "grad_norm": 2.3089118003845215, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.7004050016403198, + "num_tokens": 61563762.0, + "step": 2419 + }, + { + "epoch": 0.2657588403250604, + "grad_norm": 2.2835052013397217, + "learning_rate": 1e-06, + "loss": 1.1566, + "mean_token_accuracy": 0.6693114638328552, + "num_tokens": 61590461.0, + "step": 2420 + }, + { + "epoch": 0.26586865802767407, + "grad_norm": 2.148283004760742, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.6999373435974121, + "num_tokens": 61616425.0, + "step": 2421 + }, + { + "epoch": 0.2659784757302877, + "grad_norm": 2.1885178089141846, + "learning_rate": 1e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.6936783790588379, + "num_tokens": 61646592.0, + "step": 2422 + }, + { + "epoch": 0.26608829343290136, + "grad_norm": 2.7485804557800293, + "learning_rate": 1e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7280651926994324, + "num_tokens": 61662964.0, + "step": 2423 + }, + { + "epoch": 0.26619811113551506, + "grad_norm": 2.238236904144287, + "learning_rate": 1e-06, + "loss": 1.0572, + "mean_token_accuracy": 0.683850884437561, + "num_tokens": 61688351.0, + "step": 2424 + }, + { + "epoch": 0.2663079288381287, + "grad_norm": 2.596652030944824, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.712030291557312, + "num_tokens": 61708843.0, + "step": 2425 + }, + { + "epoch": 0.26641774654074235, + "grad_norm": 2.0913383960723877, + "learning_rate": 1e-06, + "loss": 1.0774, + "mean_token_accuracy": 0.6746669411659241, + "num_tokens": 61740875.0, + "step": 2426 + }, + { + "epoch": 0.26652756424335605, + "grad_norm": 2.314711809158325, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7112957239151001, + "num_tokens": 
61764719.0, + "step": 2427 + }, + { + "epoch": 0.2666373819459697, + "grad_norm": 1.9373464584350586, + "learning_rate": 1e-06, + "loss": 1.1136, + "mean_token_accuracy": 0.6737120747566223, + "num_tokens": 61800146.0, + "step": 2428 + }, + { + "epoch": 0.26674719964858334, + "grad_norm": 2.3394644260406494, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.700330376625061, + "num_tokens": 61823377.0, + "step": 2429 + }, + { + "epoch": 0.266857017351197, + "grad_norm": 2.1578471660614014, + "learning_rate": 1e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.6908031702041626, + "num_tokens": 61848146.0, + "step": 2430 + }, + { + "epoch": 0.2669668350538107, + "grad_norm": 2.3392996788024902, + "learning_rate": 1e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.6944049000740051, + "num_tokens": 61872896.0, + "step": 2431 + }, + { + "epoch": 0.26707665275642434, + "grad_norm": 2.1208109855651855, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7060261964797974, + "num_tokens": 61898205.0, + "step": 2432 + }, + { + "epoch": 0.267186470459038, + "grad_norm": 2.3734512329101562, + "learning_rate": 1e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.7309807538986206, + "num_tokens": 61918643.0, + "step": 2433 + }, + { + "epoch": 0.2672962881616517, + "grad_norm": 2.018649101257324, + "learning_rate": 1e-06, + "loss": 1.0563, + "mean_token_accuracy": 0.6857880353927612, + "num_tokens": 61952206.0, + "step": 2434 + }, + { + "epoch": 0.26740610586426533, + "grad_norm": 2.259467124938965, + "learning_rate": 1e-06, + "loss": 1.1141, + "mean_token_accuracy": 0.6697297096252441, + "num_tokens": 61980597.0, + "step": 2435 + }, + { + "epoch": 0.267515923566879, + "grad_norm": 2.4558799266815186, + "learning_rate": 1e-06, + "loss": 1.0637, + "mean_token_accuracy": 0.6837722063064575, + "num_tokens": 62005276.0, + "step": 2436 + }, + { + "epoch": 0.2676257412694926, + "grad_norm": 2.3144586086273193, + "learning_rate": 1e-06, + "loss": 
1.0796, + "mean_token_accuracy": 0.6771861910820007, + "num_tokens": 62029706.0, + "step": 2437 + }, + { + "epoch": 0.2677355589721063, + "grad_norm": 2.1391000747680664, + "learning_rate": 1e-06, + "loss": 1.0419, + "mean_token_accuracy": 0.6863638162612915, + "num_tokens": 62057471.0, + "step": 2438 + }, + { + "epoch": 0.26784537667471997, + "grad_norm": 2.342730760574341, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7164100408554077, + "num_tokens": 62081276.0, + "step": 2439 + }, + { + "epoch": 0.2679551943773336, + "grad_norm": 2.498739719390869, + "learning_rate": 1e-06, + "loss": 1.0076, + "mean_token_accuracy": 0.7069803476333618, + "num_tokens": 62102802.0, + "step": 2440 + }, + { + "epoch": 0.26806501207994726, + "grad_norm": 2.273329734802246, + "learning_rate": 1e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.687412440776825, + "num_tokens": 62128206.0, + "step": 2441 + }, + { + "epoch": 0.26817482978256096, + "grad_norm": 2.087733030319214, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7110520005226135, + "num_tokens": 62156138.0, + "step": 2442 + }, + { + "epoch": 0.2682846474851746, + "grad_norm": 2.153963565826416, + "learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.6998441219329834, + "num_tokens": 62182124.0, + "step": 2443 + }, + { + "epoch": 0.26839446518778826, + "grad_norm": 2.375410795211792, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7073746919631958, + "num_tokens": 62205949.0, + "step": 2444 + }, + { + "epoch": 0.26850428289040196, + "grad_norm": 2.174130439758301, + "learning_rate": 1e-06, + "loss": 1.0241, + "mean_token_accuracy": 0.6883001923561096, + "num_tokens": 62234019.0, + "step": 2445 + }, + { + "epoch": 0.2686141005930156, + "grad_norm": 2.6714630126953125, + "learning_rate": 1e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7250884175300598, + "num_tokens": 62253391.0, + "step": 2446 + }, + { + "epoch": 0.26872391829562925, + 
"grad_norm": 2.0268285274505615, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.6959109306335449, + "num_tokens": 62283443.0, + "step": 2447 + }, + { + "epoch": 0.2688337359982429, + "grad_norm": 2.123905658721924, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.6954890489578247, + "num_tokens": 62310190.0, + "step": 2448 + }, + { + "epoch": 0.2689435537008566, + "grad_norm": 2.421372890472412, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7096490859985352, + "num_tokens": 62332607.0, + "step": 2449 + }, + { + "epoch": 0.26905337140347024, + "grad_norm": 2.2982430458068848, + "learning_rate": 1e-06, + "loss": 1.0761, + "mean_token_accuracy": 0.6806154251098633, + "num_tokens": 62358435.0, + "step": 2450 + }, + { + "epoch": 0.2691631891060839, + "grad_norm": 2.292858600616455, + "learning_rate": 1e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.7042579650878906, + "num_tokens": 62384365.0, + "step": 2451 + }, + { + "epoch": 0.26927300680869753, + "grad_norm": 2.188115358352661, + "learning_rate": 1e-06, + "loss": 1.0366, + "mean_token_accuracy": 0.6999967098236084, + "num_tokens": 62411719.0, + "step": 2452 + }, + { + "epoch": 0.26938282451131124, + "grad_norm": 2.2097761631011963, + "learning_rate": 1e-06, + "loss": 1.0108, + "mean_token_accuracy": 0.7016626596450806, + "num_tokens": 62438560.0, + "step": 2453 + }, + { + "epoch": 0.2694926422139249, + "grad_norm": 2.1807169914245605, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7079119086265564, + "num_tokens": 62463921.0, + "step": 2454 + }, + { + "epoch": 0.26960245991653853, + "grad_norm": 2.5366456508636475, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.708251416683197, + "num_tokens": 62484424.0, + "step": 2455 + }, + { + "epoch": 0.26971227761915223, + "grad_norm": 2.040947198867798, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.6981451511383057, + "num_tokens": 
62514688.0, + "step": 2456 + }, + { + "epoch": 0.2698220953217659, + "grad_norm": 2.352084159851074, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.7002365589141846, + "num_tokens": 62537992.0, + "step": 2457 + }, + { + "epoch": 0.2699319130243795, + "grad_norm": 2.3870368003845215, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7143721580505371, + "num_tokens": 62559607.0, + "step": 2458 + }, + { + "epoch": 0.27004173072699317, + "grad_norm": 2.570915937423706, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7012612819671631, + "num_tokens": 62578398.0, + "step": 2459 + }, + { + "epoch": 0.27015154842960687, + "grad_norm": 1.9110705852508545, + "learning_rate": 1e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.6927065849304199, + "num_tokens": 62609558.0, + "step": 2460 + }, + { + "epoch": 0.2702613661322205, + "grad_norm": 2.209970474243164, + "learning_rate": 1e-06, + "loss": 1.0282, + "mean_token_accuracy": 0.6878011226654053, + "num_tokens": 62636524.0, + "step": 2461 + }, + { + "epoch": 0.27037118383483416, + "grad_norm": 2.237110137939453, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7058923840522766, + "num_tokens": 62661772.0, + "step": 2462 + }, + { + "epoch": 0.27048100153744786, + "grad_norm": 2.199153423309326, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.691929817199707, + "num_tokens": 62687100.0, + "step": 2463 + }, + { + "epoch": 0.2705908192400615, + "grad_norm": 2.2218687534332275, + "learning_rate": 1e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.6925128698348999, + "num_tokens": 62713347.0, + "step": 2464 + }, + { + "epoch": 0.27070063694267515, + "grad_norm": 2.4090235233306885, + "learning_rate": 1e-06, + "loss": 1.0332, + "mean_token_accuracy": 0.6911258697509766, + "num_tokens": 62734613.0, + "step": 2465 + }, + { + "epoch": 0.2708104546452888, + "grad_norm": 2.5582644939422607, + "learning_rate": 1e-06, + "loss": 
1.0069, + "mean_token_accuracy": 0.6984022259712219, + "num_tokens": 62754663.0, + "step": 2466 + }, + { + "epoch": 0.2709202723479025, + "grad_norm": 2.359206199645996, + "learning_rate": 1e-06, + "loss": 1.023, + "mean_token_accuracy": 0.6918426752090454, + "num_tokens": 62778316.0, + "step": 2467 + }, + { + "epoch": 0.27103009005051615, + "grad_norm": 2.2268059253692627, + "learning_rate": 1e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.6941536664962769, + "num_tokens": 62805699.0, + "step": 2468 + }, + { + "epoch": 0.2711399077531298, + "grad_norm": 2.091984748840332, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7133872509002686, + "num_tokens": 62832607.0, + "step": 2469 + }, + { + "epoch": 0.27124972545574344, + "grad_norm": 2.098665475845337, + "learning_rate": 1e-06, + "loss": 1.0476, + "mean_token_accuracy": 0.6893832683563232, + "num_tokens": 62862203.0, + "step": 2470 + }, + { + "epoch": 0.27135954315835714, + "grad_norm": 2.391200065612793, + "learning_rate": 1e-06, + "loss": 1.0957, + "mean_token_accuracy": 0.6658835411071777, + "num_tokens": 62886178.0, + "step": 2471 + }, + { + "epoch": 0.2714693608609708, + "grad_norm": 2.2025554180145264, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7122538685798645, + "num_tokens": 62912414.0, + "step": 2472 + }, + { + "epoch": 0.27157917856358443, + "grad_norm": 2.0840206146240234, + "learning_rate": 1e-06, + "loss": 1.1054, + "mean_token_accuracy": 0.6657954454421997, + "num_tokens": 62945542.0, + "step": 2473 + }, + { + "epoch": 0.27168899626619814, + "grad_norm": 2.412594795227051, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.6953701972961426, + "num_tokens": 62969475.0, + "step": 2474 + }, + { + "epoch": 0.2717988139688118, + "grad_norm": 2.1079819202423096, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.6980664730072021, + "num_tokens": 62997731.0, + "step": 2475 + }, + { + "epoch": 0.2719086316714254, + 
"grad_norm": 2.442152976989746, + "learning_rate": 1e-06, + "loss": 1.0586, + "mean_token_accuracy": 0.6885068416595459, + "num_tokens": 63021158.0, + "step": 2476 + }, + { + "epoch": 0.2720184493740391, + "grad_norm": 2.11202335357666, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.7030656337738037, + "num_tokens": 63048639.0, + "step": 2477 + }, + { + "epoch": 0.2721282670766528, + "grad_norm": 2.2316064834594727, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.7043180465698242, + "num_tokens": 63072071.0, + "step": 2478 + }, + { + "epoch": 0.2722380847792664, + "grad_norm": 2.1680474281311035, + "learning_rate": 1e-06, + "loss": 1.0205, + "mean_token_accuracy": 0.6958017349243164, + "num_tokens": 63100157.0, + "step": 2479 + }, + { + "epoch": 0.27234790248188007, + "grad_norm": 2.357790470123291, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.6997748613357544, + "num_tokens": 63124234.0, + "step": 2480 + }, + { + "epoch": 0.27245772018449377, + "grad_norm": 2.196176528930664, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.699345588684082, + "num_tokens": 63152346.0, + "step": 2481 + }, + { + "epoch": 0.2725675378871074, + "grad_norm": 1.9402227401733398, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.6968170404434204, + "num_tokens": 63185489.0, + "step": 2482 + }, + { + "epoch": 0.27267735558972106, + "grad_norm": 2.4553260803222656, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.6948608756065369, + "num_tokens": 63207416.0, + "step": 2483 + }, + { + "epoch": 0.2727871732923347, + "grad_norm": 2.012644052505493, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.6967149376869202, + "num_tokens": 63237624.0, + "step": 2484 + }, + { + "epoch": 0.2728969909949484, + "grad_norm": 2.4662535190582275, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.712525486946106, + "num_tokens": 
63258257.0, + "step": 2485 + }, + { + "epoch": 0.27300680869756205, + "grad_norm": 2.205549478530884, + "learning_rate": 1e-06, + "loss": 1.0573, + "mean_token_accuracy": 0.6845786571502686, + "num_tokens": 63283298.0, + "step": 2486 + }, + { + "epoch": 0.2731166264001757, + "grad_norm": 1.9378942251205444, + "learning_rate": 1e-06, + "loss": 1.0822, + "mean_token_accuracy": 0.6777733564376831, + "num_tokens": 63318074.0, + "step": 2487 + }, + { + "epoch": 0.27322644410278935, + "grad_norm": 2.2508678436279297, + "learning_rate": 1e-06, + "loss": 1.0456, + "mean_token_accuracy": 0.6874593496322632, + "num_tokens": 63345266.0, + "step": 2488 + }, + { + "epoch": 0.27333626180540305, + "grad_norm": 2.152892589569092, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.7028794288635254, + "num_tokens": 63373633.0, + "step": 2489 + }, + { + "epoch": 0.2734460795080167, + "grad_norm": 2.1056602001190186, + "learning_rate": 1e-06, + "loss": 1.0851, + "mean_token_accuracy": 0.680812656879425, + "num_tokens": 63402346.0, + "step": 2490 + }, + { + "epoch": 0.27355589721063034, + "grad_norm": 2.5956435203552246, + "learning_rate": 1e-06, + "loss": 1.0813, + "mean_token_accuracy": 0.6698183417320251, + "num_tokens": 63422581.0, + "step": 2491 + }, + { + "epoch": 0.27366571491324404, + "grad_norm": 2.3715312480926514, + "learning_rate": 1e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7327040433883667, + "num_tokens": 63444902.0, + "step": 2492 + }, + { + "epoch": 0.2737755326158577, + "grad_norm": 2.656062364578247, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.6987572908401489, + "num_tokens": 63465604.0, + "step": 2493 + }, + { + "epoch": 0.27388535031847133, + "grad_norm": 2.2542405128479004, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7281198501586914, + "num_tokens": 63490505.0, + "step": 2494 + }, + { + "epoch": 0.273995168021085, + "grad_norm": 2.660231113433838, + "learning_rate": 1e-06, + "loss": 
0.937, + "mean_token_accuracy": 0.7137329578399658, + "num_tokens": 63509855.0, + "step": 2495 + }, + { + "epoch": 0.2741049857236987, + "grad_norm": 2.1778783798217773, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6870545148849487, + "num_tokens": 63539606.0, + "step": 2496 + }, + { + "epoch": 0.2742148034263123, + "grad_norm": 2.2684922218322754, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.7130217552185059, + "num_tokens": 63564649.0, + "step": 2497 + }, + { + "epoch": 0.27432462112892597, + "grad_norm": 2.0409865379333496, + "learning_rate": 1e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.7017092704772949, + "num_tokens": 63595740.0, + "step": 2498 + }, + { + "epoch": 0.2744344388315396, + "grad_norm": 2.269683599472046, + "learning_rate": 1e-06, + "loss": 1.0411, + "mean_token_accuracy": 0.6896746158599854, + "num_tokens": 63623748.0, + "step": 2499 + }, + { + "epoch": 0.2745442565341533, + "grad_norm": 2.3176894187927246, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7064863443374634, + "num_tokens": 63648099.0, + "step": 2500 + }, + { + "epoch": 0.27465407423676697, + "grad_norm": 2.3611674308776855, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7046754360198975, + "num_tokens": 63673796.0, + "step": 2501 + }, + { + "epoch": 0.2747638919393806, + "grad_norm": 2.5548532009124756, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7179268598556519, + "num_tokens": 63693175.0, + "step": 2502 + }, + { + "epoch": 0.2748737096419943, + "grad_norm": 2.475742816925049, + "learning_rate": 1e-06, + "loss": 1.0745, + "mean_token_accuracy": 0.6829231977462769, + "num_tokens": 63716594.0, + "step": 2503 + }, + { + "epoch": 0.27498352734460796, + "grad_norm": 2.1798880100250244, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.6922471523284912, + "num_tokens": 63743976.0, + "step": 2504 + }, + { + "epoch": 0.2750933450472216, + 
"grad_norm": 2.1710145473480225, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.7027654051780701, + "num_tokens": 63769759.0, + "step": 2505 + }, + { + "epoch": 0.27520316274983525, + "grad_norm": 2.2549660205841064, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.7024697661399841, + "num_tokens": 63797923.0, + "step": 2506 + }, + { + "epoch": 0.27531298045244895, + "grad_norm": 2.3159031867980957, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.7031850218772888, + "num_tokens": 63822284.0, + "step": 2507 + }, + { + "epoch": 0.2754227981550626, + "grad_norm": 2.5675220489501953, + "learning_rate": 1e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.6981554627418518, + "num_tokens": 63842964.0, + "step": 2508 + }, + { + "epoch": 0.27553261585767624, + "grad_norm": 2.2914578914642334, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.6975311040878296, + "num_tokens": 63867761.0, + "step": 2509 + }, + { + "epoch": 0.27564243356028995, + "grad_norm": 2.433727264404297, + "learning_rate": 1e-06, + "loss": 1.0859, + "mean_token_accuracy": 0.6715764999389648, + "num_tokens": 63889465.0, + "step": 2510 + }, + { + "epoch": 0.2757522512629036, + "grad_norm": 1.9459009170532227, + "learning_rate": 1e-06, + "loss": 1.059, + "mean_token_accuracy": 0.6809753179550171, + "num_tokens": 63921151.0, + "step": 2511 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 2.3132822513580322, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.6970739960670471, + "num_tokens": 63945449.0, + "step": 2512 + }, + { + "epoch": 0.2759718866681309, + "grad_norm": 2.32735276222229, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.7039940357208252, + "num_tokens": 63970137.0, + "step": 2513 + }, + { + "epoch": 0.2760817043707446, + "grad_norm": 2.4627082347869873, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7024988532066345, + 
"num_tokens": 63993762.0, + "step": 2514 + }, + { + "epoch": 0.27619152207335823, + "grad_norm": 2.5801584720611572, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7223547697067261, + "num_tokens": 64014658.0, + "step": 2515 + }, + { + "epoch": 0.2763013397759719, + "grad_norm": 2.2481322288513184, + "learning_rate": 1e-06, + "loss": 1.0572, + "mean_token_accuracy": 0.692253589630127, + "num_tokens": 64041600.0, + "step": 2516 + }, + { + "epoch": 0.2764111574785855, + "grad_norm": 2.5134201049804688, + "learning_rate": 1e-06, + "loss": 1.0436, + "mean_token_accuracy": 0.6885645389556885, + "num_tokens": 64063738.0, + "step": 2517 + }, + { + "epoch": 0.2765209751811992, + "grad_norm": 1.996491551399231, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.6995278000831604, + "num_tokens": 64097933.0, + "step": 2518 + }, + { + "epoch": 0.27663079288381287, + "grad_norm": 2.3194353580474854, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.7017358541488647, + "num_tokens": 64123319.0, + "step": 2519 + }, + { + "epoch": 0.2767406105864265, + "grad_norm": 2.1929235458374023, + "learning_rate": 1e-06, + "loss": 1.0423, + "mean_token_accuracy": 0.6897413730621338, + "num_tokens": 64151549.0, + "step": 2520 + }, + { + "epoch": 0.2768504282890402, + "grad_norm": 2.341080665588379, + "learning_rate": 1e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.6950242519378662, + "num_tokens": 64175701.0, + "step": 2521 + }, + { + "epoch": 0.27696024599165386, + "grad_norm": 2.3993186950683594, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7167258262634277, + "num_tokens": 64198081.0, + "step": 2522 + }, + { + "epoch": 0.2770700636942675, + "grad_norm": 2.1435701847076416, + "learning_rate": 1e-06, + "loss": 1.024, + "mean_token_accuracy": 0.6957110166549683, + "num_tokens": 64226416.0, + "step": 2523 + }, + { + "epoch": 0.27717988139688116, + "grad_norm": 2.5124876499176025, + "learning_rate": 
1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7053935527801514, + "num_tokens": 64247183.0, + "step": 2524 + }, + { + "epoch": 0.27728969909949486, + "grad_norm": 2.292048931121826, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7085893154144287, + "num_tokens": 64271076.0, + "step": 2525 + }, + { + "epoch": 0.2773995168021085, + "grad_norm": 2.5981197357177734, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.6979926824569702, + "num_tokens": 64290122.0, + "step": 2526 + }, + { + "epoch": 0.27750933450472215, + "grad_norm": 2.0606536865234375, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.6981199979782104, + "num_tokens": 64322022.0, + "step": 2527 + }, + { + "epoch": 0.2776191522073358, + "grad_norm": 2.0737602710723877, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7007961869239807, + "num_tokens": 64351773.0, + "step": 2528 + }, + { + "epoch": 0.2777289699099495, + "grad_norm": 2.1569371223449707, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7044709324836731, + "num_tokens": 64380128.0, + "step": 2529 + }, + { + "epoch": 0.27783878761256314, + "grad_norm": 2.1472859382629395, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.712478756904602, + "num_tokens": 64405473.0, + "step": 2530 + }, + { + "epoch": 0.2779486053151768, + "grad_norm": 2.55047607421875, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7131686210632324, + "num_tokens": 64424505.0, + "step": 2531 + }, + { + "epoch": 0.2780584230177905, + "grad_norm": 2.164182424545288, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7031548023223877, + "num_tokens": 64450420.0, + "step": 2532 + }, + { + "epoch": 0.27816824072040414, + "grad_norm": 2.3713767528533936, + "learning_rate": 1e-06, + "loss": 1.1079, + "mean_token_accuracy": 0.66472327709198, + "num_tokens": 64475653.0, + "step": 2533 + }, + { + "epoch": 
0.2782780584230178, + "grad_norm": 2.601266384124756, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7060997486114502, + "num_tokens": 64497999.0, + "step": 2534 + }, + { + "epoch": 0.27838787612563143, + "grad_norm": 2.075009346008301, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7257330417633057, + "num_tokens": 64525201.0, + "step": 2535 + }, + { + "epoch": 0.27849769382824513, + "grad_norm": 2.187488079071045, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7205244898796082, + "num_tokens": 64549965.0, + "step": 2536 + }, + { + "epoch": 0.2786075115308588, + "grad_norm": 2.3843390941619873, + "learning_rate": 1e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.706779420375824, + "num_tokens": 64571253.0, + "step": 2537 + }, + { + "epoch": 0.2787173292334724, + "grad_norm": 2.337128162384033, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.7027736902236938, + "num_tokens": 64595778.0, + "step": 2538 + }, + { + "epoch": 0.2788271469360861, + "grad_norm": 2.002742290496826, + "learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.6888407468795776, + "num_tokens": 64627700.0, + "step": 2539 + }, + { + "epoch": 0.27893696463869977, + "grad_norm": 2.5093250274658203, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.6959774494171143, + "num_tokens": 64650805.0, + "step": 2540 + }, + { + "epoch": 0.2790467823413134, + "grad_norm": 2.0752980709075928, + "learning_rate": 1e-06, + "loss": 1.0735, + "mean_token_accuracy": 0.6861924529075623, + "num_tokens": 64680905.0, + "step": 2541 + }, + { + "epoch": 0.27915660004392706, + "grad_norm": 2.3584794998168945, + "learning_rate": 1e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.6830258965492249, + "num_tokens": 64704878.0, + "step": 2542 + }, + { + "epoch": 0.27926641774654076, + "grad_norm": 2.090268135070801, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 
0.7087658643722534, + "num_tokens": 64732273.0, + "step": 2543 + }, + { + "epoch": 0.2793762354491544, + "grad_norm": 2.4045746326446533, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.6893519163131714, + "num_tokens": 64755569.0, + "step": 2544 + }, + { + "epoch": 0.27948605315176805, + "grad_norm": 2.100191116333008, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.6980112195014954, + "num_tokens": 64784322.0, + "step": 2545 + }, + { + "epoch": 0.2795958708543817, + "grad_norm": 2.4524827003479004, + "learning_rate": 1e-06, + "loss": 1.0637, + "mean_token_accuracy": 0.6795852184295654, + "num_tokens": 64805994.0, + "step": 2546 + }, + { + "epoch": 0.2797056885569954, + "grad_norm": 2.3643343448638916, + "learning_rate": 1e-06, + "loss": 1.043, + "mean_token_accuracy": 0.6864386796951294, + "num_tokens": 64829433.0, + "step": 2547 + }, + { + "epoch": 0.27981550625960905, + "grad_norm": 2.1435563564300537, + "learning_rate": 1e-06, + "loss": 1.1119, + "mean_token_accuracy": 0.6677017211914062, + "num_tokens": 64859807.0, + "step": 2548 + }, + { + "epoch": 0.2799253239622227, + "grad_norm": 2.2638797760009766, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7083075046539307, + "num_tokens": 64885744.0, + "step": 2549 + }, + { + "epoch": 0.2800351416648364, + "grad_norm": 2.435436964035034, + "learning_rate": 1e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.7042014002799988, + "num_tokens": 64909228.0, + "step": 2550 + }, + { + "epoch": 0.28014495936745004, + "grad_norm": 2.589022636413574, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.6910866498947144, + "num_tokens": 64929464.0, + "step": 2551 + }, + { + "epoch": 0.2802547770700637, + "grad_norm": 2.3591837882995605, + "learning_rate": 1e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.6940808296203613, + "num_tokens": 64952840.0, + "step": 2552 + }, + { + "epoch": 0.28036459477267733, + "grad_norm": 2.139770030975342, 
+ "learning_rate": 1e-06, + "loss": 1.1048, + "mean_token_accuracy": 0.6714903116226196, + "num_tokens": 64981819.0, + "step": 2553 + }, + { + "epoch": 0.28047441247529104, + "grad_norm": 2.0285916328430176, + "learning_rate": 1e-06, + "loss": 1.0379, + "mean_token_accuracy": 0.6870213150978088, + "num_tokens": 65012681.0, + "step": 2554 + }, + { + "epoch": 0.2805842301779047, + "grad_norm": 2.3418567180633545, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.7086079716682434, + "num_tokens": 65035678.0, + "step": 2555 + }, + { + "epoch": 0.2806940478805183, + "grad_norm": 2.409454584121704, + "learning_rate": 1e-06, + "loss": 1.0269, + "mean_token_accuracy": 0.6921095848083496, + "num_tokens": 65058187.0, + "step": 2556 + }, + { + "epoch": 0.28080386558313203, + "grad_norm": 2.1277012825012207, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.7013957500457764, + "num_tokens": 65086699.0, + "step": 2557 + }, + { + "epoch": 0.2809136832857457, + "grad_norm": 2.470862627029419, + "learning_rate": 1e-06, + "loss": 1.0811, + "mean_token_accuracy": 0.6744886636734009, + "num_tokens": 65109618.0, + "step": 2558 + }, + { + "epoch": 0.2810235009883593, + "grad_norm": 2.198556661605835, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7145645618438721, + "num_tokens": 65134084.0, + "step": 2559 + }, + { + "epoch": 0.28113331869097297, + "grad_norm": 2.4175026416778564, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7086822390556335, + "num_tokens": 65156365.0, + "step": 2560 + }, + { + "epoch": 0.28124313639358667, + "grad_norm": 2.169071674346924, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7055485248565674, + "num_tokens": 65181622.0, + "step": 2561 + }, + { + "epoch": 0.2813529540962003, + "grad_norm": 2.1579134464263916, + "learning_rate": 1e-06, + "loss": 1.0673, + "mean_token_accuracy": 0.6814802885055542, + "num_tokens": 65208727.0, + "step": 2562 + }, 
+ { + "epoch": 0.28146277179881396, + "grad_norm": 2.6937639713287354, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7108420133590698, + "num_tokens": 65227874.0, + "step": 2563 + }, + { + "epoch": 0.2815725895014276, + "grad_norm": 2.1692867279052734, + "learning_rate": 1e-06, + "loss": 1.0646, + "mean_token_accuracy": 0.6791213154792786, + "num_tokens": 65257561.0, + "step": 2564 + }, + { + "epoch": 0.2816824072040413, + "grad_norm": 2.290290117263794, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7114814519882202, + "num_tokens": 65281117.0, + "step": 2565 + }, + { + "epoch": 0.28179222490665495, + "grad_norm": 2.4164421558380127, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.7066359519958496, + "num_tokens": 65302345.0, + "step": 2566 + }, + { + "epoch": 0.2819020426092686, + "grad_norm": 2.4365196228027344, + "learning_rate": 1e-06, + "loss": 1.0857, + "mean_token_accuracy": 0.680709958076477, + "num_tokens": 65324745.0, + "step": 2567 + }, + { + "epoch": 0.2820118603118823, + "grad_norm": 1.9649522304534912, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.6899946928024292, + "num_tokens": 65354878.0, + "step": 2568 + }, + { + "epoch": 0.28212167801449595, + "grad_norm": 2.523524045944214, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7152289152145386, + "num_tokens": 65375797.0, + "step": 2569 + }, + { + "epoch": 0.2822314957171096, + "grad_norm": 2.2764732837677, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6965872049331665, + "num_tokens": 65400273.0, + "step": 2570 + }, + { + "epoch": 0.28234131341972324, + "grad_norm": 2.2817342281341553, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7064021825790405, + "num_tokens": 65424409.0, + "step": 2571 + }, + { + "epoch": 0.28245113112233694, + "grad_norm": 2.3987958431243896, + "learning_rate": 1e-06, + "loss": 0.9862, + 
"mean_token_accuracy": 0.7018574476242065, + "num_tokens": 65446833.0, + "step": 2572 + }, + { + "epoch": 0.2825609488249506, + "grad_norm": 2.218193531036377, + "learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.7294540405273438, + "num_tokens": 65469726.0, + "step": 2573 + }, + { + "epoch": 0.28267076652756423, + "grad_norm": 2.584383010864258, + "learning_rate": 1e-06, + "loss": 1.0478, + "mean_token_accuracy": 0.6844720840454102, + "num_tokens": 65491830.0, + "step": 2574 + }, + { + "epoch": 0.2827805842301779, + "grad_norm": 2.165972948074341, + "learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.6904040575027466, + "num_tokens": 65517452.0, + "step": 2575 + }, + { + "epoch": 0.2828904019327916, + "grad_norm": 2.450848340988159, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.702778697013855, + "num_tokens": 65540743.0, + "step": 2576 + }, + { + "epoch": 0.2830002196354052, + "grad_norm": 2.439112663269043, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.7086199522018433, + "num_tokens": 65562860.0, + "step": 2577 + }, + { + "epoch": 0.28311003733801887, + "grad_norm": 2.184840202331543, + "learning_rate": 1e-06, + "loss": 1.0172, + "mean_token_accuracy": 0.7021857500076294, + "num_tokens": 65590094.0, + "step": 2578 + }, + { + "epoch": 0.2832198550406326, + "grad_norm": 2.407940149307251, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7046197056770325, + "num_tokens": 65614119.0, + "step": 2579 + }, + { + "epoch": 0.2833296727432462, + "grad_norm": 2.2967140674591064, + "learning_rate": 1e-06, + "loss": 1.0583, + "mean_token_accuracy": 0.6860026121139526, + "num_tokens": 65641223.0, + "step": 2580 + }, + { + "epoch": 0.28343949044585987, + "grad_norm": 2.0799126625061035, + "learning_rate": 1e-06, + "loss": 1.0605, + "mean_token_accuracy": 0.6834745407104492, + "num_tokens": 65670390.0, + "step": 2581 + }, + { + "epoch": 0.2835493081484735, + "grad_norm": 
2.123124837875366, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7080515623092651, + "num_tokens": 65700027.0, + "step": 2582 + }, + { + "epoch": 0.2836591258510872, + "grad_norm": 2.335427761077881, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7172180414199829, + "num_tokens": 65722985.0, + "step": 2583 + }, + { + "epoch": 0.28376894355370086, + "grad_norm": 2.099255323410034, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7055531144142151, + "num_tokens": 65750318.0, + "step": 2584 + }, + { + "epoch": 0.2838787612563145, + "grad_norm": 2.759530782699585, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.6933009624481201, + "num_tokens": 65771072.0, + "step": 2585 + }, + { + "epoch": 0.2839885789589282, + "grad_norm": 2.17952823638916, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7134116888046265, + "num_tokens": 65799153.0, + "step": 2586 + }, + { + "epoch": 0.28409839666154185, + "grad_norm": 1.8534777164459229, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7100166082382202, + "num_tokens": 65832950.0, + "step": 2587 + }, + { + "epoch": 0.2842082143641555, + "grad_norm": 2.3148250579833984, + "learning_rate": 1e-06, + "loss": 1.0194, + "mean_token_accuracy": 0.69796222448349, + "num_tokens": 65857281.0, + "step": 2588 + }, + { + "epoch": 0.28431803206676914, + "grad_norm": 2.42228102684021, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7011666893959045, + "num_tokens": 65879360.0, + "step": 2589 + }, + { + "epoch": 0.28442784976938285, + "grad_norm": 2.090435266494751, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.7005859613418579, + "num_tokens": 65909562.0, + "step": 2590 + }, + { + "epoch": 0.2845376674719965, + "grad_norm": 2.216914176940918, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.6986279487609863, + "num_tokens": 65934667.0, + 
"step": 2591 + }, + { + "epoch": 0.28464748517461014, + "grad_norm": 2.208350658416748, + "learning_rate": 1e-06, + "loss": 1.0556, + "mean_token_accuracy": 0.6846886873245239, + "num_tokens": 65960937.0, + "step": 2592 + }, + { + "epoch": 0.2847573028772238, + "grad_norm": 2.238337278366089, + "learning_rate": 1e-06, + "loss": 1.096, + "mean_token_accuracy": 0.6745561957359314, + "num_tokens": 65990527.0, + "step": 2593 + }, + { + "epoch": 0.2848671205798375, + "grad_norm": 2.335026502609253, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.6972161531448364, + "num_tokens": 66013787.0, + "step": 2594 + }, + { + "epoch": 0.28497693828245113, + "grad_norm": 2.386124849319458, + "learning_rate": 1e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6933366060256958, + "num_tokens": 66037718.0, + "step": 2595 + }, + { + "epoch": 0.2850867559850648, + "grad_norm": 2.3278119564056396, + "learning_rate": 1e-06, + "loss": 1.0244, + "mean_token_accuracy": 0.6944559812545776, + "num_tokens": 66062260.0, + "step": 2596 + }, + { + "epoch": 0.2851965736876785, + "grad_norm": 2.26108455657959, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7131069898605347, + "num_tokens": 66087529.0, + "step": 2597 + }, + { + "epoch": 0.2853063913902921, + "grad_norm": 2.115830898284912, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.6990344524383545, + "num_tokens": 66115676.0, + "step": 2598 + }, + { + "epoch": 0.28541620909290577, + "grad_norm": 2.3521766662597656, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7132381796836853, + "num_tokens": 66139290.0, + "step": 2599 + }, + { + "epoch": 0.2855260267955194, + "grad_norm": 2.1870014667510986, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6898337602615356, + "num_tokens": 66167400.0, + "step": 2600 + }, + { + "epoch": 0.2856358444981331, + "grad_norm": 2.1657633781433105, + "learning_rate": 1e-06, + "loss": 0.9729, + 
"mean_token_accuracy": 0.7097232937812805, + "num_tokens": 66193573.0, + "step": 2601 + }, + { + "epoch": 0.28574566220074676, + "grad_norm": 2.2044219970703125, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.7023631930351257, + "num_tokens": 66218146.0, + "step": 2602 + }, + { + "epoch": 0.2858554799033604, + "grad_norm": 2.3573033809661865, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7220429182052612, + "num_tokens": 66241755.0, + "step": 2603 + }, + { + "epoch": 0.28596529760597406, + "grad_norm": 2.4583792686462402, + "learning_rate": 1e-06, + "loss": 1.0955, + "mean_token_accuracy": 0.6774742007255554, + "num_tokens": 66264368.0, + "step": 2604 + }, + { + "epoch": 0.28607511530858776, + "grad_norm": 2.2136619091033936, + "learning_rate": 1e-06, + "loss": 1.0213, + "mean_token_accuracy": 0.7016068696975708, + "num_tokens": 66292148.0, + "step": 2605 + }, + { + "epoch": 0.2861849330112014, + "grad_norm": 2.239851713180542, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7053545713424683, + "num_tokens": 66317677.0, + "step": 2606 + }, + { + "epoch": 0.28629475071381505, + "grad_norm": 2.2509143352508545, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7109248638153076, + "num_tokens": 66340770.0, + "step": 2607 + }, + { + "epoch": 0.28640456841642875, + "grad_norm": 2.2937798500061035, + "learning_rate": 1e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.6912655830383301, + "num_tokens": 66365800.0, + "step": 2608 + }, + { + "epoch": 0.2865143861190424, + "grad_norm": 2.4126510620117188, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7194973230361938, + "num_tokens": 66387281.0, + "step": 2609 + }, + { + "epoch": 0.28662420382165604, + "grad_norm": 2.507000207901001, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7074236273765564, + "num_tokens": 66408183.0, + "step": 2610 + }, + { + "epoch": 0.2867340215242697, + 
"grad_norm": 2.1377437114715576, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.6944173574447632, + "num_tokens": 66436631.0, + "step": 2611 + }, + { + "epoch": 0.2868438392268834, + "grad_norm": 2.0270495414733887, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7066906094551086, + "num_tokens": 66465499.0, + "step": 2612 + }, + { + "epoch": 0.28695365692949704, + "grad_norm": 2.533064603805542, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7088136672973633, + "num_tokens": 66486648.0, + "step": 2613 + }, + { + "epoch": 0.2870634746321107, + "grad_norm": 2.3992199897766113, + "learning_rate": 1e-06, + "loss": 1.039, + "mean_token_accuracy": 0.6878204345703125, + "num_tokens": 66510236.0, + "step": 2614 + }, + { + "epoch": 0.2871732923347244, + "grad_norm": 2.3452975749969482, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.7020933628082275, + "num_tokens": 66534984.0, + "step": 2615 + }, + { + "epoch": 0.28728311003733803, + "grad_norm": 2.34279727935791, + "learning_rate": 1e-06, + "loss": 1.099, + "mean_token_accuracy": 0.6795903444290161, + "num_tokens": 66560005.0, + "step": 2616 + }, + { + "epoch": 0.2873929277399517, + "grad_norm": 2.5495200157165527, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6902906894683838, + "num_tokens": 66580079.0, + "step": 2617 + }, + { + "epoch": 0.2875027454425653, + "grad_norm": 2.4788708686828613, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.6902787685394287, + "num_tokens": 66601586.0, + "step": 2618 + }, + { + "epoch": 0.287612563145179, + "grad_norm": 2.5792088508605957, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.6945852637290955, + "num_tokens": 66624268.0, + "step": 2619 + }, + { + "epoch": 0.28772238084779267, + "grad_norm": 2.2241644859313965, + "learning_rate": 1e-06, + "loss": 1.038, + "mean_token_accuracy": 0.6857668161392212, + "num_tokens": 
66649093.0, + "step": 2620 + }, + { + "epoch": 0.2878321985504063, + "grad_norm": 2.384701728820801, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7186157703399658, + "num_tokens": 66671369.0, + "step": 2621 + }, + { + "epoch": 0.28794201625301996, + "grad_norm": 2.4088315963745117, + "learning_rate": 1e-06, + "loss": 1.0911, + "mean_token_accuracy": 0.6794006824493408, + "num_tokens": 66697486.0, + "step": 2622 + }, + { + "epoch": 0.28805183395563366, + "grad_norm": 2.3829691410064697, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7271515727043152, + "num_tokens": 66720219.0, + "step": 2623 + }, + { + "epoch": 0.2881616516582473, + "grad_norm": 2.5097403526306152, + "learning_rate": 1e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.6926029920578003, + "num_tokens": 66743252.0, + "step": 2624 + }, + { + "epoch": 0.28827146936086095, + "grad_norm": 2.0945847034454346, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.7024261951446533, + "num_tokens": 66770763.0, + "step": 2625 + }, + { + "epoch": 0.28838128706347466, + "grad_norm": 2.2669878005981445, + "learning_rate": 1e-06, + "loss": 1.0237, + "mean_token_accuracy": 0.6923201084136963, + "num_tokens": 66796753.0, + "step": 2626 + }, + { + "epoch": 0.2884911047660883, + "grad_norm": 2.3971359729766846, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7072960734367371, + "num_tokens": 66819853.0, + "step": 2627 + }, + { + "epoch": 0.28860092246870195, + "grad_norm": 2.208875894546509, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.7048642635345459, + "num_tokens": 66845251.0, + "step": 2628 + }, + { + "epoch": 0.2887107401713156, + "grad_norm": 2.1409194469451904, + "learning_rate": 1e-06, + "loss": 1.0512, + "mean_token_accuracy": 0.6839014291763306, + "num_tokens": 66873388.0, + "step": 2629 + }, + { + "epoch": 0.2888205578739293, + "grad_norm": 2.489973545074463, + "learning_rate": 1e-06, + 
"loss": 0.9418, + "mean_token_accuracy": 0.7151739001274109, + "num_tokens": 66893723.0, + "step": 2630 + }, + { + "epoch": 0.28893037557654294, + "grad_norm": 2.2347326278686523, + "learning_rate": 1e-06, + "loss": 1.0433, + "mean_token_accuracy": 0.6812988519668579, + "num_tokens": 66919860.0, + "step": 2631 + }, + { + "epoch": 0.2890401932791566, + "grad_norm": 2.280618190765381, + "learning_rate": 1e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.6960492134094238, + "num_tokens": 66946965.0, + "step": 2632 + }, + { + "epoch": 0.2891500109817703, + "grad_norm": 2.247545003890991, + "learning_rate": 1e-06, + "loss": 1.1089, + "mean_token_accuracy": 0.6759402751922607, + "num_tokens": 66974851.0, + "step": 2633 + }, + { + "epoch": 0.28925982868438394, + "grad_norm": 2.06097149848938, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.699852705001831, + "num_tokens": 67005644.0, + "step": 2634 + }, + { + "epoch": 0.2893696463869976, + "grad_norm": 2.268739938735962, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.6986406445503235, + "num_tokens": 67031030.0, + "step": 2635 + }, + { + "epoch": 0.2894794640896112, + "grad_norm": 2.3483152389526367, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.6970677971839905, + "num_tokens": 67054058.0, + "step": 2636 + }, + { + "epoch": 0.28958928179222493, + "grad_norm": 2.356259822845459, + "learning_rate": 1e-06, + "loss": 1.039, + "mean_token_accuracy": 0.6994339227676392, + "num_tokens": 67078253.0, + "step": 2637 + }, + { + "epoch": 0.2896990994948386, + "grad_norm": 2.373002290725708, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7014904618263245, + "num_tokens": 67101845.0, + "step": 2638 + }, + { + "epoch": 0.2898089171974522, + "grad_norm": 2.2194018363952637, + "learning_rate": 1e-06, + "loss": 1.0747, + "mean_token_accuracy": 0.6795980334281921, + "num_tokens": 67129631.0, + "step": 2639 + }, + { + "epoch": 
0.28991873490006587, + "grad_norm": 2.254500389099121, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.7017685174942017, + "num_tokens": 67155606.0, + "step": 2640 + }, + { + "epoch": 0.29002855260267957, + "grad_norm": 2.19834566116333, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7011922597885132, + "num_tokens": 67180967.0, + "step": 2641 + }, + { + "epoch": 0.2901383703052932, + "grad_norm": 1.9222017526626587, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7212226390838623, + "num_tokens": 67213239.0, + "step": 2642 + }, + { + "epoch": 0.29024818800790686, + "grad_norm": 2.211224317550659, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7198362350463867, + "num_tokens": 67238402.0, + "step": 2643 + }, + { + "epoch": 0.29035800571052056, + "grad_norm": 2.3497962951660156, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7108926773071289, + "num_tokens": 67260563.0, + "step": 2644 + }, + { + "epoch": 0.2904678234131342, + "grad_norm": 2.2038707733154297, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.6941711902618408, + "num_tokens": 67287933.0, + "step": 2645 + }, + { + "epoch": 0.29057764111574785, + "grad_norm": 2.3198373317718506, + "learning_rate": 1e-06, + "loss": 1.0728, + "mean_token_accuracy": 0.6824098229408264, + "num_tokens": 67311593.0, + "step": 2646 + }, + { + "epoch": 0.2906874588183615, + "grad_norm": 2.056636095046997, + "learning_rate": 1e-06, + "loss": 1.0638, + "mean_token_accuracy": 0.687256932258606, + "num_tokens": 67342736.0, + "step": 2647 + }, + { + "epoch": 0.2907972765209752, + "grad_norm": 2.1365623474121094, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7084842920303345, + "num_tokens": 67368564.0, + "step": 2648 + }, + { + "epoch": 0.29090709422358885, + "grad_norm": 4.365606307983398, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 
0.7067464590072632, + "num_tokens": 67389183.0, + "step": 2649 + }, + { + "epoch": 0.2910169119262025, + "grad_norm": 2.1169323921203613, + "learning_rate": 1e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.6911256909370422, + "num_tokens": 67417072.0, + "step": 2650 + }, + { + "epoch": 0.29112672962881614, + "grad_norm": 2.3824379444122314, + "learning_rate": 1e-06, + "loss": 1.0453, + "mean_token_accuracy": 0.6887822151184082, + "num_tokens": 67442572.0, + "step": 2651 + }, + { + "epoch": 0.29123654733142984, + "grad_norm": 2.419835090637207, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.6999349594116211, + "num_tokens": 67466071.0, + "step": 2652 + }, + { + "epoch": 0.2913463650340435, + "grad_norm": 2.4099555015563965, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7018195390701294, + "num_tokens": 67487753.0, + "step": 2653 + }, + { + "epoch": 0.29145618273665713, + "grad_norm": 2.555474042892456, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7215985655784607, + "num_tokens": 67508116.0, + "step": 2654 + }, + { + "epoch": 0.29156600043927083, + "grad_norm": 2.148033380508423, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7032228708267212, + "num_tokens": 67534268.0, + "step": 2655 + }, + { + "epoch": 0.2916758181418845, + "grad_norm": 2.6186025142669678, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.705680251121521, + "num_tokens": 67554477.0, + "step": 2656 + }, + { + "epoch": 0.2917856358444981, + "grad_norm": 2.289111375808716, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.6918329000473022, + "num_tokens": 67579870.0, + "step": 2657 + }, + { + "epoch": 0.29189545354711177, + "grad_norm": 2.3826096057891846, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.7014812231063843, + "num_tokens": 67601961.0, + "step": 2658 + }, + { + "epoch": 0.2920052712497255, + "grad_norm": 2.372610092163086, 
+ "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.6961517333984375, + "num_tokens": 67624630.0, + "step": 2659 + }, + { + "epoch": 0.2921150889523391, + "grad_norm": 2.302293539047241, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7198699712753296, + "num_tokens": 67646971.0, + "step": 2660 + }, + { + "epoch": 0.29222490665495277, + "grad_norm": 2.202322244644165, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.6867067813873291, + "num_tokens": 67670398.0, + "step": 2661 + }, + { + "epoch": 0.29233472435756647, + "grad_norm": 2.2939655780792236, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7107180953025818, + "num_tokens": 67695372.0, + "step": 2662 + }, + { + "epoch": 0.2924445420601801, + "grad_norm": 2.323336124420166, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.6912232637405396, + "num_tokens": 67719368.0, + "step": 2663 + }, + { + "epoch": 0.29255435976279376, + "grad_norm": 2.6868770122528076, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7245588302612305, + "num_tokens": 67744385.0, + "step": 2664 + }, + { + "epoch": 0.2926641774654074, + "grad_norm": 2.4617555141448975, + "learning_rate": 1e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.6989826560020447, + "num_tokens": 67766612.0, + "step": 2665 + }, + { + "epoch": 0.2927739951680211, + "grad_norm": 2.4281508922576904, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7120162844657898, + "num_tokens": 67787881.0, + "step": 2666 + }, + { + "epoch": 0.29288381287063475, + "grad_norm": 2.0792791843414307, + "learning_rate": 1e-06, + "loss": 1.0516, + "mean_token_accuracy": 0.686279833316803, + "num_tokens": 67815925.0, + "step": 2667 + }, + { + "epoch": 0.2929936305732484, + "grad_norm": 2.1234171390533447, + "learning_rate": 1e-06, + "loss": 1.0572, + "mean_token_accuracy": 0.6912084817886353, + "num_tokens": 67844715.0, + "step": 2668 + }, 
+ { + "epoch": 0.29310344827586204, + "grad_norm": 2.0310637950897217, + "learning_rate": 1e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.6983997821807861, + "num_tokens": 67874447.0, + "step": 2669 + }, + { + "epoch": 0.29321326597847575, + "grad_norm": 2.727438449859619, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.7091434597969055, + "num_tokens": 67892287.0, + "step": 2670 + }, + { + "epoch": 0.2933230836810894, + "grad_norm": 2.16953182220459, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.704412579536438, + "num_tokens": 67917779.0, + "step": 2671 + }, + { + "epoch": 0.29343290138370304, + "grad_norm": 2.0870301723480225, + "learning_rate": 1e-06, + "loss": 1.0491, + "mean_token_accuracy": 0.6873290538787842, + "num_tokens": 67946364.0, + "step": 2672 + }, + { + "epoch": 0.29354271908631674, + "grad_norm": 2.564323902130127, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.703453540802002, + "num_tokens": 67964720.0, + "step": 2673 + }, + { + "epoch": 0.2936525367889304, + "grad_norm": 2.1442372798919678, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.7006510496139526, + "num_tokens": 67992049.0, + "step": 2674 + }, + { + "epoch": 0.29376235449154403, + "grad_norm": 2.2906036376953125, + "learning_rate": 1e-06, + "loss": 1.1128, + "mean_token_accuracy": 0.6728315353393555, + "num_tokens": 68018610.0, + "step": 2675 + }, + { + "epoch": 0.2938721721941577, + "grad_norm": 2.3906962871551514, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.6938323974609375, + "num_tokens": 68042108.0, + "step": 2676 + }, + { + "epoch": 0.2939819898967714, + "grad_norm": 2.350202798843384, + "learning_rate": 1e-06, + "loss": 1.0791, + "mean_token_accuracy": 0.6888242363929749, + "num_tokens": 68071310.0, + "step": 2677 + }, + { + "epoch": 0.294091807599385, + "grad_norm": 2.5612237453460693, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 
0.7052779197692871, + "num_tokens": 68091826.0, + "step": 2678 + }, + { + "epoch": 0.29420162530199867, + "grad_norm": 2.561591386795044, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7015293836593628, + "num_tokens": 68110605.0, + "step": 2679 + }, + { + "epoch": 0.2943114430046123, + "grad_norm": 2.832582473754883, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.6957682967185974, + "num_tokens": 68128415.0, + "step": 2680 + }, + { + "epoch": 0.294421260707226, + "grad_norm": 2.1860668659210205, + "learning_rate": 1e-06, + "loss": 1.0357, + "mean_token_accuracy": 0.6892836093902588, + "num_tokens": 68156041.0, + "step": 2681 + }, + { + "epoch": 0.29453107840983966, + "grad_norm": 2.470672130584717, + "learning_rate": 1e-06, + "loss": 1.0248, + "mean_token_accuracy": 0.6913832426071167, + "num_tokens": 68175988.0, + "step": 2682 + }, + { + "epoch": 0.2946408961124533, + "grad_norm": 2.4249958992004395, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7105304002761841, + "num_tokens": 68198649.0, + "step": 2683 + }, + { + "epoch": 0.294750713815067, + "grad_norm": 2.285658836364746, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.6952019929885864, + "num_tokens": 68222947.0, + "step": 2684 + }, + { + "epoch": 0.29486053151768066, + "grad_norm": 2.3064539432525635, + "learning_rate": 1e-06, + "loss": 1.0262, + "mean_token_accuracy": 0.6812918186187744, + "num_tokens": 68245499.0, + "step": 2685 + }, + { + "epoch": 0.2949703492202943, + "grad_norm": 2.3473358154296875, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7212908267974854, + "num_tokens": 68268270.0, + "step": 2686 + }, + { + "epoch": 0.29508016692290795, + "grad_norm": 2.4036855697631836, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7054141163825989, + "num_tokens": 68291400.0, + "step": 2687 + }, + { + "epoch": 0.29518998462552165, + "grad_norm": 2.2374913692474365, 
+ "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.6966949701309204, + "num_tokens": 68317071.0, + "step": 2688 + }, + { + "epoch": 0.2952998023281353, + "grad_norm": 2.1351330280303955, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7023653984069824, + "num_tokens": 68343675.0, + "step": 2689 + }, + { + "epoch": 0.29540962003074894, + "grad_norm": 2.790314197540283, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.695806086063385, + "num_tokens": 68361197.0, + "step": 2690 + }, + { + "epoch": 0.29551943773336264, + "grad_norm": 2.3360037803649902, + "learning_rate": 1e-06, + "loss": 1.048, + "mean_token_accuracy": 0.686822772026062, + "num_tokens": 68386602.0, + "step": 2691 + }, + { + "epoch": 0.2956292554359763, + "grad_norm": 2.182288646697998, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7059739828109741, + "num_tokens": 68412385.0, + "step": 2692 + }, + { + "epoch": 0.29573907313858994, + "grad_norm": 2.1416919231414795, + "learning_rate": 1e-06, + "loss": 1.0561, + "mean_token_accuracy": 0.6830915212631226, + "num_tokens": 68441544.0, + "step": 2693 + }, + { + "epoch": 0.2958488908412036, + "grad_norm": 2.193253517150879, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7002197504043579, + "num_tokens": 68468328.0, + "step": 2694 + }, + { + "epoch": 0.2959587085438173, + "grad_norm": 2.3618924617767334, + "learning_rate": 1e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.70201176404953, + "num_tokens": 68491179.0, + "step": 2695 + }, + { + "epoch": 0.29606852624643093, + "grad_norm": 2.245945692062378, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7306729555130005, + "num_tokens": 68516167.0, + "step": 2696 + }, + { + "epoch": 0.2961783439490446, + "grad_norm": 2.2458791732788086, + "learning_rate": 1e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.6988657712936401, + "num_tokens": 68541439.0, + "step": 2697 + }, + { 
+ "epoch": 0.2962881616516582, + "grad_norm": 2.060124397277832, + "learning_rate": 1e-06, + "loss": 1.0346, + "mean_token_accuracy": 0.6947042346000671, + "num_tokens": 68570460.0, + "step": 2698 + }, + { + "epoch": 0.2963979793542719, + "grad_norm": 2.618272542953491, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7042705416679382, + "num_tokens": 68591105.0, + "step": 2699 + }, + { + "epoch": 0.29650779705688557, + "grad_norm": 2.2146801948547363, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.6962378621101379, + "num_tokens": 68616242.0, + "step": 2700 + }, + { + "epoch": 0.2966176147594992, + "grad_norm": 2.1662330627441406, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6862831115722656, + "num_tokens": 68643505.0, + "step": 2701 + }, + { + "epoch": 0.2967274324621129, + "grad_norm": 2.243788480758667, + "learning_rate": 1e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7226523160934448, + "num_tokens": 68667375.0, + "step": 2702 + }, + { + "epoch": 0.29683725016472656, + "grad_norm": 3.1909942626953125, + "learning_rate": 1e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.7285280227661133, + "num_tokens": 68681496.0, + "step": 2703 + }, + { + "epoch": 0.2969470678673402, + "grad_norm": 2.0839152336120605, + "learning_rate": 1e-06, + "loss": 0.881, + "mean_token_accuracy": 0.7215877771377563, + "num_tokens": 68706387.0, + "step": 2704 + }, + { + "epoch": 0.29705688556995385, + "grad_norm": 2.393500804901123, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.693902850151062, + "num_tokens": 68730167.0, + "step": 2705 + }, + { + "epoch": 0.29716670327256756, + "grad_norm": 2.5993049144744873, + "learning_rate": 1e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6882032155990601, + "num_tokens": 68750951.0, + "step": 2706 + }, + { + "epoch": 0.2972765209751812, + "grad_norm": 2.381462812423706, + "learning_rate": 1e-06, + "loss": 1.0316, + "mean_token_accuracy": 
0.6951367855072021, + "num_tokens": 68774305.0, + "step": 2707 + }, + { + "epoch": 0.29738633867779485, + "grad_norm": 2.377025604248047, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.7122269868850708, + "num_tokens": 68798494.0, + "step": 2708 + }, + { + "epoch": 0.29749615638040855, + "grad_norm": 2.4406137466430664, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7120704054832458, + "num_tokens": 68819408.0, + "step": 2709 + }, + { + "epoch": 0.2976059740830222, + "grad_norm": 2.2205026149749756, + "learning_rate": 1e-06, + "loss": 1.039, + "mean_token_accuracy": 0.6868220567703247, + "num_tokens": 68846764.0, + "step": 2710 + }, + { + "epoch": 0.29771579178563584, + "grad_norm": 2.2095181941986084, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.6957758665084839, + "num_tokens": 68874986.0, + "step": 2711 + }, + { + "epoch": 0.2978256094882495, + "grad_norm": 2.11307692527771, + "learning_rate": 1e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.694669246673584, + "num_tokens": 68902682.0, + "step": 2712 + }, + { + "epoch": 0.2979354271908632, + "grad_norm": 2.6384565830230713, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7181487083435059, + "num_tokens": 68920596.0, + "step": 2713 + }, + { + "epoch": 0.29804524489347684, + "grad_norm": 2.470374822616577, + "learning_rate": 1e-06, + "loss": 1.0711, + "mean_token_accuracy": 0.6827289462089539, + "num_tokens": 68942859.0, + "step": 2714 + }, + { + "epoch": 0.2981550625960905, + "grad_norm": 2.1392765045166016, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7058542966842651, + "num_tokens": 68969833.0, + "step": 2715 + }, + { + "epoch": 0.2982648802987041, + "grad_norm": 2.5108981132507324, + "learning_rate": 1e-06, + "loss": 1.0616, + "mean_token_accuracy": 0.6807492971420288, + "num_tokens": 68993832.0, + "step": 2716 + }, + { + "epoch": 0.29837469800131783, + "grad_norm": 2.0575406551361084, 
+ "learning_rate": 1e-06, + "loss": 1.0298, + "mean_token_accuracy": 0.6863380074501038, + "num_tokens": 69025030.0, + "step": 2717 + }, + { + "epoch": 0.2984845157039315, + "grad_norm": 2.4514503479003906, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7153404951095581, + "num_tokens": 69046262.0, + "step": 2718 + }, + { + "epoch": 0.2985943334065451, + "grad_norm": 2.5738394260406494, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7118275165557861, + "num_tokens": 69067366.0, + "step": 2719 + }, + { + "epoch": 0.2987041511091588, + "grad_norm": 2.5155227184295654, + "learning_rate": 1e-06, + "loss": 1.0637, + "mean_token_accuracy": 0.6774486899375916, + "num_tokens": 69090557.0, + "step": 2720 + }, + { + "epoch": 0.29881396881177247, + "grad_norm": 2.253019094467163, + "learning_rate": 1e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.6952500343322754, + "num_tokens": 69118263.0, + "step": 2721 + }, + { + "epoch": 0.2989237865143861, + "grad_norm": 2.4060611724853516, + "learning_rate": 1e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.6918778419494629, + "num_tokens": 69142697.0, + "step": 2722 + }, + { + "epoch": 0.29903360421699976, + "grad_norm": 2.318142890930176, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7235007286071777, + "num_tokens": 69164422.0, + "step": 2723 + }, + { + "epoch": 0.29914342191961346, + "grad_norm": 2.2055728435516357, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7160634994506836, + "num_tokens": 69190745.0, + "step": 2724 + }, + { + "epoch": 0.2992532396222271, + "grad_norm": 2.257985830307007, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.699579119682312, + "num_tokens": 69218269.0, + "step": 2725 + }, + { + "epoch": 0.29936305732484075, + "grad_norm": 2.437163829803467, + "learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.6997496485710144, + "num_tokens": 69240439.0, + "step": 2726 + }, + 
{ + "epoch": 0.2994728750274544, + "grad_norm": 2.968838930130005, + "learning_rate": 1e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7187834978103638, + "num_tokens": 69256567.0, + "step": 2727 + }, + { + "epoch": 0.2995826927300681, + "grad_norm": 2.4145889282226562, + "learning_rate": 1e-06, + "loss": 1.0426, + "mean_token_accuracy": 0.6908283829689026, + "num_tokens": 69281217.0, + "step": 2728 + }, + { + "epoch": 0.29969251043268175, + "grad_norm": 2.465785026550293, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.6977386474609375, + "num_tokens": 69303541.0, + "step": 2729 + }, + { + "epoch": 0.2998023281352954, + "grad_norm": 2.250701904296875, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7022463083267212, + "num_tokens": 69328923.0, + "step": 2730 + }, + { + "epoch": 0.2999121458379091, + "grad_norm": 2.391019821166992, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7083074450492859, + "num_tokens": 69354240.0, + "step": 2731 + }, + { + "epoch": 0.30002196354052274, + "grad_norm": 2.4732284545898438, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7140336036682129, + "num_tokens": 69377025.0, + "step": 2732 + }, + { + "epoch": 0.3001317812431364, + "grad_norm": 2.4488091468811035, + "learning_rate": 1e-06, + "loss": 1.0378, + "mean_token_accuracy": 0.6973724365234375, + "num_tokens": 69400432.0, + "step": 2733 + }, + { + "epoch": 0.30024159894575003, + "grad_norm": 2.512160301208496, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6956448554992676, + "num_tokens": 69425043.0, + "step": 2734 + }, + { + "epoch": 0.30035141664836373, + "grad_norm": 1.8548706769943237, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.7033355236053467, + "num_tokens": 69462173.0, + "step": 2735 + }, + { + "epoch": 0.3004612343509774, + "grad_norm": 2.2704458236694336, + "learning_rate": 1e-06, + "loss": 1.0685, + "mean_token_accuracy": 
0.683702826499939, + "num_tokens": 69488284.0, + "step": 2736 + }, + { + "epoch": 0.300571052053591, + "grad_norm": 2.0499520301818848, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7292524576187134, + "num_tokens": 69516829.0, + "step": 2737 + }, + { + "epoch": 0.3006808697562047, + "grad_norm": 2.347940444946289, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7287701964378357, + "num_tokens": 69538973.0, + "step": 2738 + }, + { + "epoch": 0.3007906874588184, + "grad_norm": 2.245697259902954, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.6978151798248291, + "num_tokens": 69562593.0, + "step": 2739 + }, + { + "epoch": 0.300900505161432, + "grad_norm": 2.1445488929748535, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.6930739879608154, + "num_tokens": 69591185.0, + "step": 2740 + }, + { + "epoch": 0.30101032286404567, + "grad_norm": 2.2430741786956787, + "learning_rate": 1e-06, + "loss": 1.0526, + "mean_token_accuracy": 0.6815245747566223, + "num_tokens": 69619918.0, + "step": 2741 + }, + { + "epoch": 0.30112014056665937, + "grad_norm": 2.270482063293457, + "learning_rate": 1e-06, + "loss": 1.0687, + "mean_token_accuracy": 0.6728160381317139, + "num_tokens": 69645607.0, + "step": 2742 + }, + { + "epoch": 0.301229958269273, + "grad_norm": 2.0635933876037598, + "learning_rate": 1e-06, + "loss": 1.0317, + "mean_token_accuracy": 0.6909514665603638, + "num_tokens": 69674044.0, + "step": 2743 + }, + { + "epoch": 0.30133977597188666, + "grad_norm": 2.387873888015747, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7046575546264648, + "num_tokens": 69697156.0, + "step": 2744 + }, + { + "epoch": 0.3014495936745003, + "grad_norm": 1.9992985725402832, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6870845556259155, + "num_tokens": 69726616.0, + "step": 2745 + }, + { + "epoch": 0.301559411377114, + "grad_norm": 2.5990993976593018, + 
"learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.7173325419425964, + "num_tokens": 69747104.0, + "step": 2746 + }, + { + "epoch": 0.30166922907972765, + "grad_norm": 2.6102352142333984, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7071651816368103, + "num_tokens": 69767214.0, + "step": 2747 + }, + { + "epoch": 0.3017790467823413, + "grad_norm": 2.7140066623687744, + "learning_rate": 1e-06, + "loss": 1.0581, + "mean_token_accuracy": 0.683933436870575, + "num_tokens": 69788420.0, + "step": 2748 + }, + { + "epoch": 0.301888864484955, + "grad_norm": 2.6114203929901123, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.6949001550674438, + "num_tokens": 69808964.0, + "step": 2749 + }, + { + "epoch": 0.30199868218756865, + "grad_norm": 2.182399272918701, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.6960426568984985, + "num_tokens": 69838537.0, + "step": 2750 + }, + { + "epoch": 0.3021084998901823, + "grad_norm": 2.3447647094726562, + "learning_rate": 1e-06, + "loss": 1.0296, + "mean_token_accuracy": 0.6844608783721924, + "num_tokens": 69861876.0, + "step": 2751 + }, + { + "epoch": 0.30221831759279594, + "grad_norm": 2.340056896209717, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.686206042766571, + "num_tokens": 69885370.0, + "step": 2752 + }, + { + "epoch": 0.30232813529540964, + "grad_norm": 2.0497212409973145, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.692876398563385, + "num_tokens": 69916992.0, + "step": 2753 + }, + { + "epoch": 0.3024379529980233, + "grad_norm": 2.1395819187164307, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.7083399295806885, + "num_tokens": 69943562.0, + "step": 2754 + }, + { + "epoch": 0.30254777070063693, + "grad_norm": 2.0699474811553955, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7178841829299927, + "num_tokens": 69973066.0, + "step": 2755 + }, + 
{ + "epoch": 0.3026575884032506, + "grad_norm": 2.2195630073547363, + "learning_rate": 1e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.6928859949111938, + "num_tokens": 69999388.0, + "step": 2756 + }, + { + "epoch": 0.3027674061058643, + "grad_norm": 2.282305955886841, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.6949412822723389, + "num_tokens": 70023128.0, + "step": 2757 + }, + { + "epoch": 0.3028772238084779, + "grad_norm": 1.9525376558303833, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.7032114267349243, + "num_tokens": 70053108.0, + "step": 2758 + }, + { + "epoch": 0.30298704151109157, + "grad_norm": 2.1273701190948486, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7180976867675781, + "num_tokens": 70081682.0, + "step": 2759 + }, + { + "epoch": 0.30309685921370527, + "grad_norm": 2.6557087898254395, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7186598777770996, + "num_tokens": 70099740.0, + "step": 2760 + }, + { + "epoch": 0.3032066769163189, + "grad_norm": 2.2263119220733643, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7116864919662476, + "num_tokens": 70125543.0, + "step": 2761 + }, + { + "epoch": 0.30331649461893256, + "grad_norm": 2.2864296436309814, + "learning_rate": 1e-06, + "loss": 1.0724, + "mean_token_accuracy": 0.6873654127120972, + "num_tokens": 70151049.0, + "step": 2762 + }, + { + "epoch": 0.3034263123215462, + "grad_norm": 2.3093392848968506, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7124634981155396, + "num_tokens": 70173164.0, + "step": 2763 + }, + { + "epoch": 0.3035361300241599, + "grad_norm": 2.3933072090148926, + "learning_rate": 1e-06, + "loss": 1.0294, + "mean_token_accuracy": 0.683725118637085, + "num_tokens": 70195820.0, + "step": 2764 + }, + { + "epoch": 0.30364594772677356, + "grad_norm": 2.1996853351593018, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 
0.7128055095672607, + "num_tokens": 70221278.0, + "step": 2765 + }, + { + "epoch": 0.3037557654293872, + "grad_norm": 2.4706435203552246, + "learning_rate": 1e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.6907374858856201, + "num_tokens": 70246057.0, + "step": 2766 + }, + { + "epoch": 0.3038655831320009, + "grad_norm": 2.1718695163726807, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.6956989169120789, + "num_tokens": 70275391.0, + "step": 2767 + }, + { + "epoch": 0.30397540083461455, + "grad_norm": 2.3115999698638916, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7154943943023682, + "num_tokens": 70300569.0, + "step": 2768 + }, + { + "epoch": 0.3040852185372282, + "grad_norm": 2.288604736328125, + "learning_rate": 1e-06, + "loss": 1.0355, + "mean_token_accuracy": 0.6938384771347046, + "num_tokens": 70326565.0, + "step": 2769 + }, + { + "epoch": 0.30419503623984184, + "grad_norm": 2.4670495986938477, + "learning_rate": 1e-06, + "loss": 1.0744, + "mean_token_accuracy": 0.6816928386688232, + "num_tokens": 70348814.0, + "step": 2770 + }, + { + "epoch": 0.30430485394245554, + "grad_norm": 2.5864052772521973, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.7009588479995728, + "num_tokens": 70369908.0, + "step": 2771 + }, + { + "epoch": 0.3044146716450692, + "grad_norm": 2.541067123413086, + "learning_rate": 1e-06, + "loss": 1.0451, + "mean_token_accuracy": 0.6881380677223206, + "num_tokens": 70392419.0, + "step": 2772 + }, + { + "epoch": 0.30452448934768284, + "grad_norm": 2.2169835567474365, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.6962777376174927, + "num_tokens": 70419104.0, + "step": 2773 + }, + { + "epoch": 0.3046343070502965, + "grad_norm": 2.546088933944702, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.70713210105896, + "num_tokens": 70441218.0, + "step": 2774 + }, + { + "epoch": 0.3047441247529102, + "grad_norm": 2.0484540462493896, 
+ "learning_rate": 1e-06, + "loss": 1.0474, + "mean_token_accuracy": 0.6829034686088562, + "num_tokens": 70471995.0, + "step": 2775 + }, + { + "epoch": 0.30485394245552383, + "grad_norm": 2.0548572540283203, + "learning_rate": 1e-06, + "loss": 1.0676, + "mean_token_accuracy": 0.6845260858535767, + "num_tokens": 70501625.0, + "step": 2776 + }, + { + "epoch": 0.3049637601581375, + "grad_norm": 2.6919074058532715, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7277467846870422, + "num_tokens": 70521058.0, + "step": 2777 + }, + { + "epoch": 0.3050735778607512, + "grad_norm": 2.265941619873047, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7050234079360962, + "num_tokens": 70544958.0, + "step": 2778 + }, + { + "epoch": 0.3051833955633648, + "grad_norm": 2.0565290451049805, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7059787511825562, + "num_tokens": 70572990.0, + "step": 2779 + }, + { + "epoch": 0.30529321326597847, + "grad_norm": 1.911101222038269, + "learning_rate": 1e-06, + "loss": 1.0876, + "mean_token_accuracy": 0.6807478070259094, + "num_tokens": 70606956.0, + "step": 2780 + }, + { + "epoch": 0.3054030309685921, + "grad_norm": 2.2092206478118896, + "learning_rate": 1e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.6976091265678406, + "num_tokens": 70632290.0, + "step": 2781 + }, + { + "epoch": 0.3055128486712058, + "grad_norm": 2.2461163997650146, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.6921250820159912, + "num_tokens": 70659418.0, + "step": 2782 + }, + { + "epoch": 0.30562266637381946, + "grad_norm": 2.3863961696624756, + "learning_rate": 1e-06, + "loss": 1.0501, + "mean_token_accuracy": 0.685661792755127, + "num_tokens": 70683118.0, + "step": 2783 + }, + { + "epoch": 0.3057324840764331, + "grad_norm": 2.060908317565918, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7139264345169067, + "num_tokens": 70709571.0, + "step": 2784 + }, 
+ { + "epoch": 0.30584230177904675, + "grad_norm": 2.013889789581299, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7116564512252808, + "num_tokens": 70738519.0, + "step": 2785 + }, + { + "epoch": 0.30595211948166046, + "grad_norm": 1.9124521017074585, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.7042280435562134, + "num_tokens": 70773681.0, + "step": 2786 + }, + { + "epoch": 0.3060619371842741, + "grad_norm": 2.2873260974884033, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.6884012222290039, + "num_tokens": 70798827.0, + "step": 2787 + }, + { + "epoch": 0.30617175488688775, + "grad_norm": 2.1933753490448, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7264792919158936, + "num_tokens": 70824883.0, + "step": 2788 + }, + { + "epoch": 0.30628157258950145, + "grad_norm": 2.3115830421447754, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.6957254409790039, + "num_tokens": 70850316.0, + "step": 2789 + }, + { + "epoch": 0.3063913902921151, + "grad_norm": 2.2476093769073486, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7275999188423157, + "num_tokens": 70873921.0, + "step": 2790 + }, + { + "epoch": 0.30650120799472874, + "grad_norm": 2.4745984077453613, + "learning_rate": 1e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7283281087875366, + "num_tokens": 70894842.0, + "step": 2791 + }, + { + "epoch": 0.3066110256973424, + "grad_norm": 2.2108211517333984, + "learning_rate": 1e-06, + "loss": 1.0324, + "mean_token_accuracy": 0.6960883736610413, + "num_tokens": 70919633.0, + "step": 2792 + }, + { + "epoch": 0.3067208433999561, + "grad_norm": 2.22124981880188, + "learning_rate": 1e-06, + "loss": 1.0429, + "mean_token_accuracy": 0.6903456449508667, + "num_tokens": 70946502.0, + "step": 2793 + }, + { + "epoch": 0.30683066110256974, + "grad_norm": 2.4226651191711426, + "learning_rate": 1e-06, + "loss": 0.9347, + 
"mean_token_accuracy": 0.7116606831550598, + "num_tokens": 70967503.0, + "step": 2794 + }, + { + "epoch": 0.3069404788051834, + "grad_norm": 2.3742611408233643, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7156913876533508, + "num_tokens": 70990658.0, + "step": 2795 + }, + { + "epoch": 0.3070502965077971, + "grad_norm": 2.331782579421997, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.7031630277633667, + "num_tokens": 71015561.0, + "step": 2796 + }, + { + "epoch": 0.30716011421041073, + "grad_norm": 2.2400290966033936, + "learning_rate": 1e-06, + "loss": 1.0688, + "mean_token_accuracy": 0.6818798184394836, + "num_tokens": 71042464.0, + "step": 2797 + }, + { + "epoch": 0.3072699319130244, + "grad_norm": 2.174128770828247, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.7095299959182739, + "num_tokens": 71070299.0, + "step": 2798 + }, + { + "epoch": 0.307379749615638, + "grad_norm": 2.4565861225128174, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7051827907562256, + "num_tokens": 71092824.0, + "step": 2799 + }, + { + "epoch": 0.3074895673182517, + "grad_norm": 2.1735680103302, + "learning_rate": 1e-06, + "loss": 1.0809, + "mean_token_accuracy": 0.691183865070343, + "num_tokens": 71121480.0, + "step": 2800 + }, + { + "epoch": 0.30759938502086537, + "grad_norm": 2.0551700592041016, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.6994001865386963, + "num_tokens": 71151573.0, + "step": 2801 + }, + { + "epoch": 0.307709202723479, + "grad_norm": 2.141624927520752, + "learning_rate": 1e-06, + "loss": 1.0671, + "mean_token_accuracy": 0.6773326396942139, + "num_tokens": 71179748.0, + "step": 2802 + }, + { + "epoch": 0.30781902042609266, + "grad_norm": 2.27380108833313, + "learning_rate": 1e-06, + "loss": 1.1341, + "mean_token_accuracy": 0.6670585870742798, + "num_tokens": 71206427.0, + "step": 2803 + }, + { + "epoch": 0.30792883812870636, + "grad_norm": 
2.0024099349975586, + "learning_rate": 1e-06, + "loss": 1.0668, + "mean_token_accuracy": 0.682753324508667, + "num_tokens": 71237359.0, + "step": 2804 + }, + { + "epoch": 0.30803865583132, + "grad_norm": 2.503486156463623, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.7070229649543762, + "num_tokens": 71257186.0, + "step": 2805 + }, + { + "epoch": 0.30814847353393365, + "grad_norm": 2.0854270458221436, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6958360075950623, + "num_tokens": 71286020.0, + "step": 2806 + }, + { + "epoch": 0.30825829123654735, + "grad_norm": 2.355882406234741, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7238777279853821, + "num_tokens": 71308127.0, + "step": 2807 + }, + { + "epoch": 0.308368108939161, + "grad_norm": 2.068122148513794, + "learning_rate": 1e-06, + "loss": 0.8778, + "mean_token_accuracy": 0.7276219129562378, + "num_tokens": 71335016.0, + "step": 2808 + }, + { + "epoch": 0.30847792664177465, + "grad_norm": 2.4071273803710938, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7017896771430969, + "num_tokens": 71356307.0, + "step": 2809 + }, + { + "epoch": 0.3085877443443883, + "grad_norm": 2.3056600093841553, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.7005782127380371, + "num_tokens": 71379910.0, + "step": 2810 + }, + { + "epoch": 0.308697562047002, + "grad_norm": 2.1346166133880615, + "learning_rate": 1e-06, + "loss": 1.0486, + "mean_token_accuracy": 0.6885108947753906, + "num_tokens": 71407478.0, + "step": 2811 + }, + { + "epoch": 0.30880737974961564, + "grad_norm": 2.2455556392669678, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.708808958530426, + "num_tokens": 71434052.0, + "step": 2812 + }, + { + "epoch": 0.3089171974522293, + "grad_norm": 2.0747127532958984, + "learning_rate": 1e-06, + "loss": 1.0717, + "mean_token_accuracy": 0.6765300035476685, + "num_tokens": 71465501.0, + 
"step": 2813 + }, + { + "epoch": 0.309027015154843, + "grad_norm": 1.9518654346466064, + "learning_rate": 1e-06, + "loss": 1.0928, + "mean_token_accuracy": 0.6746103763580322, + "num_tokens": 71500628.0, + "step": 2814 + }, + { + "epoch": 0.30913683285745663, + "grad_norm": 2.281667947769165, + "learning_rate": 1e-06, + "loss": 1.0692, + "mean_token_accuracy": 0.6818169355392456, + "num_tokens": 71525353.0, + "step": 2815 + }, + { + "epoch": 0.3092466505600703, + "grad_norm": 2.3732986450195312, + "learning_rate": 1e-06, + "loss": 1.0391, + "mean_token_accuracy": 0.7040902376174927, + "num_tokens": 71551156.0, + "step": 2816 + }, + { + "epoch": 0.3093564682626839, + "grad_norm": 2.192047357559204, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.7006338834762573, + "num_tokens": 71578828.0, + "step": 2817 + }, + { + "epoch": 0.3094662859652976, + "grad_norm": 2.4709208011627197, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.701355516910553, + "num_tokens": 71599985.0, + "step": 2818 + }, + { + "epoch": 0.3095761036679113, + "grad_norm": 2.395944356918335, + "learning_rate": 1e-06, + "loss": 1.0724, + "mean_token_accuracy": 0.6833678483963013, + "num_tokens": 71624518.0, + "step": 2819 + }, + { + "epoch": 0.3096859213705249, + "grad_norm": 2.336862087249756, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6940997838973999, + "num_tokens": 71648496.0, + "step": 2820 + }, + { + "epoch": 0.30979573907313857, + "grad_norm": 1.9717174768447876, + "learning_rate": 1e-06, + "loss": 1.0495, + "mean_token_accuracy": 0.6875234246253967, + "num_tokens": 71682160.0, + "step": 2821 + }, + { + "epoch": 0.30990555677575227, + "grad_norm": 2.2403037548065186, + "learning_rate": 1e-06, + "loss": 1.0641, + "mean_token_accuracy": 0.6775587797164917, + "num_tokens": 71708708.0, + "step": 2822 + }, + { + "epoch": 0.3100153744783659, + "grad_norm": 2.330035448074341, + "learning_rate": 1e-06, + "loss": 0.9319, + 
"mean_token_accuracy": 0.7147822380065918, + "num_tokens": 71731305.0, + "step": 2823 + }, + { + "epoch": 0.31012519218097956, + "grad_norm": 2.1517908573150635, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.6971545219421387, + "num_tokens": 71756981.0, + "step": 2824 + }, + { + "epoch": 0.31023500988359326, + "grad_norm": 2.1423933506011963, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7060514688491821, + "num_tokens": 71785043.0, + "step": 2825 + }, + { + "epoch": 0.3103448275862069, + "grad_norm": 2.015242099761963, + "learning_rate": 1e-06, + "loss": 1.0781, + "mean_token_accuracy": 0.6736123561859131, + "num_tokens": 71816881.0, + "step": 2826 + }, + { + "epoch": 0.31045464528882055, + "grad_norm": 2.578552484512329, + "learning_rate": 1e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7204314470291138, + "num_tokens": 71833620.0, + "step": 2827 + }, + { + "epoch": 0.3105644629914342, + "grad_norm": 2.1449410915374756, + "learning_rate": 1e-06, + "loss": 1.0416, + "mean_token_accuracy": 0.6893168091773987, + "num_tokens": 71862012.0, + "step": 2828 + }, + { + "epoch": 0.3106742806940479, + "grad_norm": 2.260746717453003, + "learning_rate": 1e-06, + "loss": 1.0427, + "mean_token_accuracy": 0.6882467865943909, + "num_tokens": 71886534.0, + "step": 2829 + }, + { + "epoch": 0.31078409839666155, + "grad_norm": 2.1198062896728516, + "learning_rate": 1e-06, + "loss": 1.0633, + "mean_token_accuracy": 0.6814208626747131, + "num_tokens": 71915391.0, + "step": 2830 + }, + { + "epoch": 0.3108939160992752, + "grad_norm": 2.254282236099243, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7092707753181458, + "num_tokens": 71940348.0, + "step": 2831 + }, + { + "epoch": 0.31100373380188884, + "grad_norm": 2.2111220359802246, + "learning_rate": 1e-06, + "loss": 1.0078, + "mean_token_accuracy": 0.7075259685516357, + "num_tokens": 71965544.0, + "step": 2832 + }, + { + "epoch": 0.31111355150450254, + 
"grad_norm": 2.2777860164642334, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7047556638717651, + "num_tokens": 71991242.0, + "step": 2833 + }, + { + "epoch": 0.3112233692071162, + "grad_norm": 2.362260580062866, + "learning_rate": 1e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.6869655847549438, + "num_tokens": 72016054.0, + "step": 2834 + }, + { + "epoch": 0.31133318690972983, + "grad_norm": 2.2742855548858643, + "learning_rate": 1e-06, + "loss": 1.0546, + "mean_token_accuracy": 0.678231418132782, + "num_tokens": 72040883.0, + "step": 2835 + }, + { + "epoch": 0.31144300461234353, + "grad_norm": 2.192927837371826, + "learning_rate": 1e-06, + "loss": 0.8845, + "mean_token_accuracy": 0.7167889475822449, + "num_tokens": 72067074.0, + "step": 2836 + }, + { + "epoch": 0.3115528223149572, + "grad_norm": 2.29773211479187, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.6962465047836304, + "num_tokens": 72092333.0, + "step": 2837 + }, + { + "epoch": 0.3116626400175708, + "grad_norm": 2.313018560409546, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.6920616626739502, + "num_tokens": 72117327.0, + "step": 2838 + }, + { + "epoch": 0.31177245772018447, + "grad_norm": 2.214945077896118, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.696235179901123, + "num_tokens": 72143016.0, + "step": 2839 + }, + { + "epoch": 0.31188227542279817, + "grad_norm": 2.321646213531494, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.6960971355438232, + "num_tokens": 72165253.0, + "step": 2840 + }, + { + "epoch": 0.3119920931254118, + "grad_norm": 2.2638702392578125, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.705030083656311, + "num_tokens": 72190142.0, + "step": 2841 + }, + { + "epoch": 0.31210191082802546, + "grad_norm": 2.1899428367614746, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.6982280015945435, + "num_tokens": 
72216786.0, + "step": 2842 + }, + { + "epoch": 0.31221172853063917, + "grad_norm": 1.8462927341461182, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.6950230598449707, + "num_tokens": 72250405.0, + "step": 2843 + }, + { + "epoch": 0.3123215462332528, + "grad_norm": 2.3870582580566406, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.6987887620925903, + "num_tokens": 72273326.0, + "step": 2844 + }, + { + "epoch": 0.31243136393586646, + "grad_norm": 2.173316240310669, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.693474292755127, + "num_tokens": 72301626.0, + "step": 2845 + }, + { + "epoch": 0.3125411816384801, + "grad_norm": 2.207052707672119, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7115604877471924, + "num_tokens": 72325936.0, + "step": 2846 + }, + { + "epoch": 0.3126509993410938, + "grad_norm": 2.3782410621643066, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.7140918970108032, + "num_tokens": 72347706.0, + "step": 2847 + }, + { + "epoch": 0.31276081704370745, + "grad_norm": 2.1910741329193115, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.705570638179779, + "num_tokens": 72373819.0, + "step": 2848 + }, + { + "epoch": 0.3128706347463211, + "grad_norm": 2.2432198524475098, + "learning_rate": 1e-06, + "loss": 1.0566, + "mean_token_accuracy": 0.685833752155304, + "num_tokens": 72400218.0, + "step": 2849 + }, + { + "epoch": 0.31298045244893474, + "grad_norm": 2.269404172897339, + "learning_rate": 1e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.698464035987854, + "num_tokens": 72424503.0, + "step": 2850 + }, + { + "epoch": 0.31309027015154844, + "grad_norm": 2.2882633209228516, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7061007022857666, + "num_tokens": 72448908.0, + "step": 2851 + }, + { + "epoch": 0.3132000878541621, + "grad_norm": 1.995565414428711, + "learning_rate": 1e-06, + "loss": 
0.9968, + "mean_token_accuracy": 0.7018121480941772, + "num_tokens": 72477116.0, + "step": 2852 + }, + { + "epoch": 0.31330990555677574, + "grad_norm": 2.480633020401001, + "learning_rate": 1e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.6853057742118835, + "num_tokens": 72500818.0, + "step": 2853 + }, + { + "epoch": 0.31341972325938944, + "grad_norm": 2.1521804332733154, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.6911796927452087, + "num_tokens": 72529650.0, + "step": 2854 + }, + { + "epoch": 0.3135295409620031, + "grad_norm": 2.0702571868896484, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7152029871940613, + "num_tokens": 72559572.0, + "step": 2855 + }, + { + "epoch": 0.31363935866461673, + "grad_norm": 2.0775914192199707, + "learning_rate": 1e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.6998132467269897, + "num_tokens": 72590207.0, + "step": 2856 + }, + { + "epoch": 0.3137491763672304, + "grad_norm": 2.2697980403900146, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.6922491788864136, + "num_tokens": 72614886.0, + "step": 2857 + }, + { + "epoch": 0.3138589940698441, + "grad_norm": 1.979601263999939, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7168947458267212, + "num_tokens": 72645665.0, + "step": 2858 + }, + { + "epoch": 0.3139688117724577, + "grad_norm": 2.3709921836853027, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7274010181427002, + "num_tokens": 72669977.0, + "step": 2859 + }, + { + "epoch": 0.31407862947507137, + "grad_norm": 2.291809320449829, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7079793214797974, + "num_tokens": 72692935.0, + "step": 2860 + }, + { + "epoch": 0.314188447177685, + "grad_norm": 2.0648279190063477, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.6947818994522095, + "num_tokens": 72723426.0, + "step": 2861 + }, + { + "epoch": 0.3142982648802987, + 
"grad_norm": 2.3068008422851562, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7053331136703491, + "num_tokens": 72746366.0, + "step": 2862 + }, + { + "epoch": 0.31440808258291236, + "grad_norm": 2.2514941692352295, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.6976332068443298, + "num_tokens": 72771761.0, + "step": 2863 + }, + { + "epoch": 0.314517900285526, + "grad_norm": 2.3085432052612305, + "learning_rate": 1e-06, + "loss": 1.0321, + "mean_token_accuracy": 0.6878083944320679, + "num_tokens": 72796521.0, + "step": 2864 + }, + { + "epoch": 0.3146277179881397, + "grad_norm": 2.240332841873169, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7137863636016846, + "num_tokens": 72821563.0, + "step": 2865 + }, + { + "epoch": 0.31473753569075336, + "grad_norm": 2.3650381565093994, + "learning_rate": 1e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.6819677352905273, + "num_tokens": 72846480.0, + "step": 2866 + }, + { + "epoch": 0.314847353393367, + "grad_norm": 2.3503568172454834, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.71547532081604, + "num_tokens": 72869808.0, + "step": 2867 + }, + { + "epoch": 0.31495717109598065, + "grad_norm": 2.2626192569732666, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.6961648464202881, + "num_tokens": 72893112.0, + "step": 2868 + }, + { + "epoch": 0.31506698879859435, + "grad_norm": 2.5750892162323, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7060724496841431, + "num_tokens": 72914363.0, + "step": 2869 + }, + { + "epoch": 0.315176806501208, + "grad_norm": 2.2327466011047363, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.6941118836402893, + "num_tokens": 72938162.0, + "step": 2870 + }, + { + "epoch": 0.31528662420382164, + "grad_norm": 1.8982868194580078, + "learning_rate": 1e-06, + "loss": 1.0846, + "mean_token_accuracy": 0.678645133972168, + "num_tokens": 
72972808.0, + "step": 2871 + }, + { + "epoch": 0.31539644190643534, + "grad_norm": 2.228896141052246, + "learning_rate": 1e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.6889296770095825, + "num_tokens": 72998921.0, + "step": 2872 + }, + { + "epoch": 0.315506259609049, + "grad_norm": 2.1417934894561768, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7082328796386719, + "num_tokens": 73024951.0, + "step": 2873 + }, + { + "epoch": 0.31561607731166264, + "grad_norm": 2.083235025405884, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7172141075134277, + "num_tokens": 73052809.0, + "step": 2874 + }, + { + "epoch": 0.3157258950142763, + "grad_norm": 2.3318276405334473, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7058709859848022, + "num_tokens": 73076670.0, + "step": 2875 + }, + { + "epoch": 0.31583571271689, + "grad_norm": 2.410417318344116, + "learning_rate": 1e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.6965776681900024, + "num_tokens": 73102948.0, + "step": 2876 + }, + { + "epoch": 0.31594553041950363, + "grad_norm": 2.3644447326660156, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.69626784324646, + "num_tokens": 73126807.0, + "step": 2877 + }, + { + "epoch": 0.3160553481221173, + "grad_norm": 2.1734416484832764, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.6903339624404907, + "num_tokens": 73154808.0, + "step": 2878 + }, + { + "epoch": 0.3161651658247309, + "grad_norm": 2.3793904781341553, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7219259738922119, + "num_tokens": 73178038.0, + "step": 2879 + }, + { + "epoch": 0.3162749835273446, + "grad_norm": 2.254694700241089, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7296256422996521, + "num_tokens": 73199942.0, + "step": 2880 + }, + { + "epoch": 0.31638480122995827, + "grad_norm": 2.2005462646484375, + "learning_rate": 1e-06, + "loss": 
0.9304, + "mean_token_accuracy": 0.7184572219848633, + "num_tokens": 73225074.0, + "step": 2881 + }, + { + "epoch": 0.3164946189325719, + "grad_norm": 2.1244513988494873, + "learning_rate": 1e-06, + "loss": 1.0739, + "mean_token_accuracy": 0.6767237186431885, + "num_tokens": 73254815.0, + "step": 2882 + }, + { + "epoch": 0.3166044366351856, + "grad_norm": 2.33231258392334, + "learning_rate": 1e-06, + "loss": 1.0845, + "mean_token_accuracy": 0.6743688583374023, + "num_tokens": 73279339.0, + "step": 2883 + }, + { + "epoch": 0.31671425433779926, + "grad_norm": 2.3083724975585938, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7171825170516968, + "num_tokens": 73302955.0, + "step": 2884 + }, + { + "epoch": 0.3168240720404129, + "grad_norm": 2.3164761066436768, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7226636409759521, + "num_tokens": 73324926.0, + "step": 2885 + }, + { + "epoch": 0.31693388974302655, + "grad_norm": 2.0476930141448975, + "learning_rate": 1e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.6937443017959595, + "num_tokens": 73356281.0, + "step": 2886 + }, + { + "epoch": 0.31704370744564025, + "grad_norm": 2.271141529083252, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.7010188102722168, + "num_tokens": 73381750.0, + "step": 2887 + }, + { + "epoch": 0.3171535251482539, + "grad_norm": 2.2954342365264893, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7102868556976318, + "num_tokens": 73407113.0, + "step": 2888 + }, + { + "epoch": 0.31726334285086755, + "grad_norm": 2.2843620777130127, + "learning_rate": 1e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.690706193447113, + "num_tokens": 73430800.0, + "step": 2889 + }, + { + "epoch": 0.31737316055348125, + "grad_norm": 2.0111279487609863, + "learning_rate": 1e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.6924269199371338, + "num_tokens": 73461684.0, + "step": 2890 + }, + { + "epoch": 0.3174829782560949, 
+ "grad_norm": 2.1056675910949707, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.7002373933792114, + "num_tokens": 73487971.0, + "step": 2891 + }, + { + "epoch": 0.31759279595870854, + "grad_norm": 2.6408944129943848, + "learning_rate": 1e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.7065314054489136, + "num_tokens": 73508272.0, + "step": 2892 + }, + { + "epoch": 0.3177026136613222, + "grad_norm": 2.293693780899048, + "learning_rate": 1e-06, + "loss": 1.0324, + "mean_token_accuracy": 0.6843343377113342, + "num_tokens": 73536682.0, + "step": 2893 + }, + { + "epoch": 0.3178124313639359, + "grad_norm": 2.1158087253570557, + "learning_rate": 1e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.7081308364868164, + "num_tokens": 73562802.0, + "step": 2894 + }, + { + "epoch": 0.31792224906654953, + "grad_norm": 2.444669246673584, + "learning_rate": 1e-06, + "loss": 1.0403, + "mean_token_accuracy": 0.6882807016372681, + "num_tokens": 73587705.0, + "step": 2895 + }, + { + "epoch": 0.3180320667691632, + "grad_norm": 2.1513519287109375, + "learning_rate": 1e-06, + "loss": 1.1154, + "mean_token_accuracy": 0.678001344203949, + "num_tokens": 73617565.0, + "step": 2896 + }, + { + "epoch": 0.3181418844717768, + "grad_norm": 2.2542591094970703, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.6988726258277893, + "num_tokens": 73642300.0, + "step": 2897 + }, + { + "epoch": 0.3182517021743905, + "grad_norm": 2.14717173576355, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.6977413892745972, + "num_tokens": 73667513.0, + "step": 2898 + }, + { + "epoch": 0.3183615198770042, + "grad_norm": 2.330270290374756, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7092840671539307, + "num_tokens": 73689803.0, + "step": 2899 + }, + { + "epoch": 0.3184713375796178, + "grad_norm": 2.3826828002929688, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.7033537030220032, + "num_tokens": 
73712828.0, + "step": 2900 + }, + { + "epoch": 0.3185811552822315, + "grad_norm": 2.2164196968078613, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7078869342803955, + "num_tokens": 73739912.0, + "step": 2901 + }, + { + "epoch": 0.31869097298484517, + "grad_norm": 2.2569963932037354, + "learning_rate": 1e-06, + "loss": 1.0496, + "mean_token_accuracy": 0.6817559003829956, + "num_tokens": 73766048.0, + "step": 2902 + }, + { + "epoch": 0.3188007906874588, + "grad_norm": 2.2182490825653076, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.6991890072822571, + "num_tokens": 73791909.0, + "step": 2903 + }, + { + "epoch": 0.31891060839007246, + "grad_norm": 2.926131010055542, + "learning_rate": 1e-06, + "loss": 1.0258, + "mean_token_accuracy": 0.6900579929351807, + "num_tokens": 73810108.0, + "step": 2904 + }, + { + "epoch": 0.31902042609268616, + "grad_norm": 2.5007314682006836, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7146896719932556, + "num_tokens": 73831905.0, + "step": 2905 + }, + { + "epoch": 0.3191302437952998, + "grad_norm": 2.049677610397339, + "learning_rate": 1e-06, + "loss": 1.0078, + "mean_token_accuracy": 0.6977471113204956, + "num_tokens": 73860180.0, + "step": 2906 + }, + { + "epoch": 0.31924006149791345, + "grad_norm": 2.212477922439575, + "learning_rate": 1e-06, + "loss": 1.1087, + "mean_token_accuracy": 0.6652066707611084, + "num_tokens": 73887962.0, + "step": 2907 + }, + { + "epoch": 0.3193498792005271, + "grad_norm": 2.3660030364990234, + "learning_rate": 1e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7206053137779236, + "num_tokens": 73912004.0, + "step": 2908 + }, + { + "epoch": 0.3194596969031408, + "grad_norm": 2.4778192043304443, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7046250700950623, + "num_tokens": 73933802.0, + "step": 2909 + }, + { + "epoch": 0.31956951460575445, + "grad_norm": 2.4203453063964844, + "learning_rate": 1e-06, + 
"loss": 0.9921, + "mean_token_accuracy": 0.6985176205635071, + "num_tokens": 73957284.0, + "step": 2910 + }, + { + "epoch": 0.3196793323083681, + "grad_norm": 2.2323668003082275, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7134292125701904, + "num_tokens": 73980415.0, + "step": 2911 + }, + { + "epoch": 0.3197891500109818, + "grad_norm": 2.386601686477661, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7068691253662109, + "num_tokens": 74003488.0, + "step": 2912 + }, + { + "epoch": 0.31989896771359544, + "grad_norm": 2.2773964405059814, + "learning_rate": 1e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.7073966264724731, + "num_tokens": 74028433.0, + "step": 2913 + }, + { + "epoch": 0.3200087854162091, + "grad_norm": 2.2448465824127197, + "learning_rate": 1e-06, + "loss": 1.003, + "mean_token_accuracy": 0.7000933289527893, + "num_tokens": 74056606.0, + "step": 2914 + }, + { + "epoch": 0.32011860311882273, + "grad_norm": 2.018981695175171, + "learning_rate": 1e-06, + "loss": 1.0931, + "mean_token_accuracy": 0.6734360456466675, + "num_tokens": 74086403.0, + "step": 2915 + }, + { + "epoch": 0.32022842082143643, + "grad_norm": 2.52358078956604, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7185620665550232, + "num_tokens": 74106970.0, + "step": 2916 + }, + { + "epoch": 0.3203382385240501, + "grad_norm": 2.333566427230835, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7106356620788574, + "num_tokens": 74130540.0, + "step": 2917 + }, + { + "epoch": 0.3204480562266637, + "grad_norm": 2.145219564437866, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7038807272911072, + "num_tokens": 74158096.0, + "step": 2918 + }, + { + "epoch": 0.3205578739292774, + "grad_norm": 2.0960240364074707, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7225939035415649, + "num_tokens": 74184356.0, + "step": 2919 + }, + { + "epoch": 
0.32066769163189107, + "grad_norm": 2.1746363639831543, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.6895437836647034, + "num_tokens": 74210099.0, + "step": 2920 + }, + { + "epoch": 0.3207775093345047, + "grad_norm": 2.46163272857666, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.6995625495910645, + "num_tokens": 74231160.0, + "step": 2921 + }, + { + "epoch": 0.32088732703711836, + "grad_norm": 2.0081706047058105, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7031410336494446, + "num_tokens": 74260374.0, + "step": 2922 + }, + { + "epoch": 0.32099714473973207, + "grad_norm": 1.9823665618896484, + "learning_rate": 1e-06, + "loss": 1.0524, + "mean_token_accuracy": 0.6963462829589844, + "num_tokens": 74292054.0, + "step": 2923 + }, + { + "epoch": 0.3211069624423457, + "grad_norm": 2.19643497467041, + "learning_rate": 1e-06, + "loss": 1.0492, + "mean_token_accuracy": 0.6860083341598511, + "num_tokens": 74318944.0, + "step": 2924 + }, + { + "epoch": 0.32121678014495936, + "grad_norm": 2.064394235610962, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7040536403656006, + "num_tokens": 74348189.0, + "step": 2925 + }, + { + "epoch": 0.321326597847573, + "grad_norm": 2.3041880130767822, + "learning_rate": 1e-06, + "loss": 1.0419, + "mean_token_accuracy": 0.6846159100532532, + "num_tokens": 74372525.0, + "step": 2926 + }, + { + "epoch": 0.3214364155501867, + "grad_norm": 2.730314016342163, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7061454057693481, + "num_tokens": 74389822.0, + "step": 2927 + }, + { + "epoch": 0.32154623325280035, + "grad_norm": 2.11322283744812, + "learning_rate": 1e-06, + "loss": 1.0876, + "mean_token_accuracy": 0.6753383874893188, + "num_tokens": 74420028.0, + "step": 2928 + }, + { + "epoch": 0.321656050955414, + "grad_norm": 2.5553030967712402, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.725938618183136, 
+ "num_tokens": 74443375.0, + "step": 2929 + }, + { + "epoch": 0.3217658686580277, + "grad_norm": 2.329092264175415, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.7121782302856445, + "num_tokens": 74467215.0, + "step": 2930 + }, + { + "epoch": 0.32187568636064134, + "grad_norm": 2.3013558387756348, + "learning_rate": 1e-06, + "loss": 1.0714, + "mean_token_accuracy": 0.6871229410171509, + "num_tokens": 74491739.0, + "step": 2931 + }, + { + "epoch": 0.321985504063255, + "grad_norm": 2.442957878112793, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.6981117725372314, + "num_tokens": 74512326.0, + "step": 2932 + }, + { + "epoch": 0.32209532176586864, + "grad_norm": 2.71362042427063, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7123188972473145, + "num_tokens": 74529721.0, + "step": 2933 + }, + { + "epoch": 0.32220513946848234, + "grad_norm": 2.2948293685913086, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7099430561065674, + "num_tokens": 74553300.0, + "step": 2934 + }, + { + "epoch": 0.322314957171096, + "grad_norm": 2.443059206008911, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7150577306747437, + "num_tokens": 74574746.0, + "step": 2935 + }, + { + "epoch": 0.32242477487370963, + "grad_norm": 2.1971700191497803, + "learning_rate": 1e-06, + "loss": 1.003, + "mean_token_accuracy": 0.7007511854171753, + "num_tokens": 74602617.0, + "step": 2936 + }, + { + "epoch": 0.3225345925763233, + "grad_norm": 2.0037388801574707, + "learning_rate": 1e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.6899248957633972, + "num_tokens": 74635718.0, + "step": 2937 + }, + { + "epoch": 0.322644410278937, + "grad_norm": 2.1369566917419434, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.6974537372589111, + "num_tokens": 74662603.0, + "step": 2938 + }, + { + "epoch": 0.3227542279815506, + "grad_norm": 2.190159559249878, + "learning_rate": 
1e-06, + "loss": 1.0621, + "mean_token_accuracy": 0.6820462942123413, + "num_tokens": 74692324.0, + "step": 2939 + }, + { + "epoch": 0.32286404568416427, + "grad_norm": 2.250356435775757, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7189980745315552, + "num_tokens": 74716569.0, + "step": 2940 + }, + { + "epoch": 0.32297386338677797, + "grad_norm": 2.4403674602508545, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7110296487808228, + "num_tokens": 74738165.0, + "step": 2941 + }, + { + "epoch": 0.3230836810893916, + "grad_norm": 2.1398813724517822, + "learning_rate": 1e-06, + "loss": 1.0398, + "mean_token_accuracy": 0.6867210865020752, + "num_tokens": 74766582.0, + "step": 2942 + }, + { + "epoch": 0.32319349879200526, + "grad_norm": 2.1424694061279297, + "learning_rate": 1e-06, + "loss": 1.1237, + "mean_token_accuracy": 0.6646894216537476, + "num_tokens": 74797715.0, + "step": 2943 + }, + { + "epoch": 0.3233033164946189, + "grad_norm": 2.506244659423828, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7108050584793091, + "num_tokens": 74819408.0, + "step": 2944 + }, + { + "epoch": 0.3234131341972326, + "grad_norm": 2.5203795433044434, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.6956919431686401, + "num_tokens": 74841038.0, + "step": 2945 + }, + { + "epoch": 0.32352295189984626, + "grad_norm": 2.1080126762390137, + "learning_rate": 1e-06, + "loss": 1.0397, + "mean_token_accuracy": 0.6913108229637146, + "num_tokens": 74866713.0, + "step": 2946 + }, + { + "epoch": 0.3236327696024599, + "grad_norm": 2.102849006652832, + "learning_rate": 1e-06, + "loss": 1.0634, + "mean_token_accuracy": 0.6835154294967651, + "num_tokens": 74897219.0, + "step": 2947 + }, + { + "epoch": 0.3237425873050736, + "grad_norm": 2.6000826358795166, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.7046409845352173, + "num_tokens": 74917167.0, + "step": 2948 + }, + { + "epoch": 
0.32385240500768725, + "grad_norm": 2.463463544845581, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.7027263641357422, + "num_tokens": 74939346.0, + "step": 2949 + }, + { + "epoch": 0.3239622227103009, + "grad_norm": 2.297600269317627, + "learning_rate": 1e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.6903756260871887, + "num_tokens": 74964265.0, + "step": 2950 + }, + { + "epoch": 0.32407204041291454, + "grad_norm": 2.5911288261413574, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6910397410392761, + "num_tokens": 74983745.0, + "step": 2951 + }, + { + "epoch": 0.32418185811552824, + "grad_norm": 2.5661864280700684, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.6955768465995789, + "num_tokens": 75004229.0, + "step": 2952 + }, + { + "epoch": 0.3242916758181419, + "grad_norm": 2.0959742069244385, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7065303325653076, + "num_tokens": 75033977.0, + "step": 2953 + }, + { + "epoch": 0.32440149352075554, + "grad_norm": 2.2415575981140137, + "learning_rate": 1e-06, + "loss": 1.0433, + "mean_token_accuracy": 0.681246280670166, + "num_tokens": 75061382.0, + "step": 2954 + }, + { + "epoch": 0.3245113112233692, + "grad_norm": 2.090512752532959, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7062071561813354, + "num_tokens": 75091079.0, + "step": 2955 + }, + { + "epoch": 0.3246211289259829, + "grad_norm": 2.1190624237060547, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7091439366340637, + "num_tokens": 75118286.0, + "step": 2956 + }, + { + "epoch": 0.32473094662859653, + "grad_norm": 1.9508453607559204, + "learning_rate": 1e-06, + "loss": 1.0673, + "mean_token_accuracy": 0.6889485120773315, + "num_tokens": 75147949.0, + "step": 2957 + }, + { + "epoch": 0.3248407643312102, + "grad_norm": 2.1248724460601807, + "learning_rate": 1e-06, + "loss": 1.082, + "mean_token_accuracy": 
0.6762803792953491, + "num_tokens": 75177084.0, + "step": 2958 + }, + { + "epoch": 0.3249505820338239, + "grad_norm": 2.506592273712158, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.7042397856712341, + "num_tokens": 75196926.0, + "step": 2959 + }, + { + "epoch": 0.3250603997364375, + "grad_norm": 2.288144588470459, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7062325477600098, + "num_tokens": 75220758.0, + "step": 2960 + }, + { + "epoch": 0.32517021743905117, + "grad_norm": 2.573072671890259, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7013658881187439, + "num_tokens": 75240707.0, + "step": 2961 + }, + { + "epoch": 0.3252800351416648, + "grad_norm": 1.9786559343338013, + "learning_rate": 1e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.6959707736968994, + "num_tokens": 75270382.0, + "step": 2962 + }, + { + "epoch": 0.3253898528442785, + "grad_norm": 2.1795127391815186, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.6991334557533264, + "num_tokens": 75295676.0, + "step": 2963 + }, + { + "epoch": 0.32549967054689216, + "grad_norm": 2.068419933319092, + "learning_rate": 1e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.6892741322517395, + "num_tokens": 75323937.0, + "step": 2964 + }, + { + "epoch": 0.3256094882495058, + "grad_norm": 2.220183849334717, + "learning_rate": 1e-06, + "loss": 1.0277, + "mean_token_accuracy": 0.6869233846664429, + "num_tokens": 75350422.0, + "step": 2965 + }, + { + "epoch": 0.3257193059521195, + "grad_norm": 2.024779796600342, + "learning_rate": 1e-06, + "loss": 1.0711, + "mean_token_accuracy": 0.6861697435379028, + "num_tokens": 75381576.0, + "step": 2966 + }, + { + "epoch": 0.32582912365473315, + "grad_norm": 2.2101635932922363, + "learning_rate": 1e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.7300363183021545, + "num_tokens": 75406245.0, + "step": 2967 + }, + { + "epoch": 0.3259389413573468, + "grad_norm": 2.2312211990356445, + 
"learning_rate": 1e-06, + "loss": 0.8573, + "mean_token_accuracy": 0.7398710250854492, + "num_tokens": 75431485.0, + "step": 2968 + }, + { + "epoch": 0.32604875905996045, + "grad_norm": 2.2033395767211914, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7180484533309937, + "num_tokens": 75456331.0, + "step": 2969 + }, + { + "epoch": 0.32615857676257415, + "grad_norm": 2.1257266998291016, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.7098687887191772, + "num_tokens": 75483914.0, + "step": 2970 + }, + { + "epoch": 0.3262683944651878, + "grad_norm": 2.2588841915130615, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7212486267089844, + "num_tokens": 75506378.0, + "step": 2971 + }, + { + "epoch": 0.32637821216780144, + "grad_norm": 2.507701873779297, + "learning_rate": 1e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.6979343891143799, + "num_tokens": 75527145.0, + "step": 2972 + }, + { + "epoch": 0.3264880298704151, + "grad_norm": 2.1371703147888184, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.708719789981842, + "num_tokens": 75553516.0, + "step": 2973 + }, + { + "epoch": 0.3265978475730288, + "grad_norm": 2.305295467376709, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.6967329978942871, + "num_tokens": 75575328.0, + "step": 2974 + }, + { + "epoch": 0.32670766527564243, + "grad_norm": 2.1871144771575928, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.6979445815086365, + "num_tokens": 75601339.0, + "step": 2975 + }, + { + "epoch": 0.3268174829782561, + "grad_norm": 2.729701519012451, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.6911993026733398, + "num_tokens": 75621571.0, + "step": 2976 + }, + { + "epoch": 0.3269273006808698, + "grad_norm": 2.418888568878174, + "learning_rate": 1e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.6925386786460876, + "num_tokens": 75644461.0, + "step": 2977 + }, + 
{ + "epoch": 0.3270371183834834, + "grad_norm": 2.297926425933838, + "learning_rate": 1e-06, + "loss": 1.016, + "mean_token_accuracy": 0.7016284465789795, + "num_tokens": 75670611.0, + "step": 2978 + }, + { + "epoch": 0.3271469360860971, + "grad_norm": 2.4207284450531006, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.6970169544219971, + "num_tokens": 75691356.0, + "step": 2979 + }, + { + "epoch": 0.3272567537887107, + "grad_norm": 2.407132625579834, + "learning_rate": 1e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.689032256603241, + "num_tokens": 75714152.0, + "step": 2980 + }, + { + "epoch": 0.3273665714913244, + "grad_norm": 2.2254655361175537, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.699029266834259, + "num_tokens": 75738716.0, + "step": 2981 + }, + { + "epoch": 0.32747638919393807, + "grad_norm": 2.1751842498779297, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.70245760679245, + "num_tokens": 75765157.0, + "step": 2982 + }, + { + "epoch": 0.3275862068965517, + "grad_norm": 1.9804258346557617, + "learning_rate": 1e-06, + "loss": 1.0611, + "mean_token_accuracy": 0.6798408031463623, + "num_tokens": 75797964.0, + "step": 2983 + }, + { + "epoch": 0.32769602459916536, + "grad_norm": 2.3151941299438477, + "learning_rate": 1e-06, + "loss": 1.0411, + "mean_token_accuracy": 0.6853895783424377, + "num_tokens": 75821836.0, + "step": 2984 + }, + { + "epoch": 0.32780584230177906, + "grad_norm": 2.2454960346221924, + "learning_rate": 1e-06, + "loss": 1.0625, + "mean_token_accuracy": 0.6853616237640381, + "num_tokens": 75846588.0, + "step": 2985 + }, + { + "epoch": 0.3279156600043927, + "grad_norm": 2.206387519836426, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7063389420509338, + "num_tokens": 75870594.0, + "step": 2986 + }, + { + "epoch": 0.32802547770700635, + "grad_norm": 2.0439414978027344, + "learning_rate": 1e-06, + "loss": 1.0618, + "mean_token_accuracy": 
0.685450553894043, + "num_tokens": 75901459.0, + "step": 2987 + }, + { + "epoch": 0.32813529540962005, + "grad_norm": 2.217062473297119, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7007861137390137, + "num_tokens": 75926189.0, + "step": 2988 + }, + { + "epoch": 0.3282451131122337, + "grad_norm": 2.4392592906951904, + "learning_rate": 1e-06, + "loss": 1.0418, + "mean_token_accuracy": 0.6874527931213379, + "num_tokens": 75949188.0, + "step": 2989 + }, + { + "epoch": 0.32835493081484735, + "grad_norm": 2.1734328269958496, + "learning_rate": 1e-06, + "loss": 1.0504, + "mean_token_accuracy": 0.6901090741157532, + "num_tokens": 75973815.0, + "step": 2990 + }, + { + "epoch": 0.328464748517461, + "grad_norm": 2.2791359424591064, + "learning_rate": 1e-06, + "loss": 1.0847, + "mean_token_accuracy": 0.6773691773414612, + "num_tokens": 75997779.0, + "step": 2991 + }, + { + "epoch": 0.3285745662200747, + "grad_norm": 2.2263565063476562, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.706585705280304, + "num_tokens": 76022276.0, + "step": 2992 + }, + { + "epoch": 0.32868438392268834, + "grad_norm": 2.3361454010009766, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7247875332832336, + "num_tokens": 76043133.0, + "step": 2993 + }, + { + "epoch": 0.328794201625302, + "grad_norm": 2.2128257751464844, + "learning_rate": 1e-06, + "loss": 1.0797, + "mean_token_accuracy": 0.6793224811553955, + "num_tokens": 76070844.0, + "step": 2994 + }, + { + "epoch": 0.3289040193279157, + "grad_norm": 2.187896251678467, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7074610590934753, + "num_tokens": 76096258.0, + "step": 2995 + }, + { + "epoch": 0.32901383703052933, + "grad_norm": 2.1593024730682373, + "learning_rate": 1e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.6917059421539307, + "num_tokens": 76123593.0, + "step": 2996 + }, + { + "epoch": 0.329123654733143, + "grad_norm": 2.14467191696167, + 
"learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7027033567428589, + "num_tokens": 76150410.0, + "step": 2997 + }, + { + "epoch": 0.3292334724357566, + "grad_norm": 2.29156494140625, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6888339519500732, + "num_tokens": 76173684.0, + "step": 2998 + }, + { + "epoch": 0.3293432901383703, + "grad_norm": 2.5566916465759277, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.6977189183235168, + "num_tokens": 76196307.0, + "step": 2999 + }, + { + "epoch": 0.32945310784098397, + "grad_norm": 2.757275342941284, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7089439034461975, + "num_tokens": 76215053.0, + "step": 3000 + }, + { + "epoch": 0.3295629255435976, + "grad_norm": 2.2695658206939697, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7070399522781372, + "num_tokens": 76239301.0, + "step": 3001 + }, + { + "epoch": 0.32967274324621126, + "grad_norm": 2.0305731296539307, + "learning_rate": 1e-06, + "loss": 1.0378, + "mean_token_accuracy": 0.6853392124176025, + "num_tokens": 76268761.0, + "step": 3002 + }, + { + "epoch": 0.32978256094882497, + "grad_norm": 2.285661220550537, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7131178975105286, + "num_tokens": 76294732.0, + "step": 3003 + }, + { + "epoch": 0.3298923786514386, + "grad_norm": 2.2596118450164795, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.6889069080352783, + "num_tokens": 76319248.0, + "step": 3004 + }, + { + "epoch": 0.33000219635405226, + "grad_norm": 2.0618057250976562, + "learning_rate": 1e-06, + "loss": 1.0501, + "mean_token_accuracy": 0.7004793882369995, + "num_tokens": 76347216.0, + "step": 3005 + }, + { + "epoch": 0.33011201405666596, + "grad_norm": 2.201465606689453, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.6981731653213501, + "num_tokens": 76374671.0, + "step": 3006 + }, + 
{ + "epoch": 0.3302218317592796, + "grad_norm": 2.308894157409668, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.698078989982605, + "num_tokens": 76397262.0, + "step": 3007 + }, + { + "epoch": 0.33033164946189325, + "grad_norm": 2.193208694458008, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.7105939388275146, + "num_tokens": 76424435.0, + "step": 3008 + }, + { + "epoch": 0.3304414671645069, + "grad_norm": 2.4442481994628906, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7031793594360352, + "num_tokens": 76446542.0, + "step": 3009 + }, + { + "epoch": 0.3305512848671206, + "grad_norm": 2.078144073486328, + "learning_rate": 1e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.696103572845459, + "num_tokens": 76475163.0, + "step": 3010 + }, + { + "epoch": 0.33066110256973424, + "grad_norm": 2.3437159061431885, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.6962612867355347, + "num_tokens": 76498746.0, + "step": 3011 + }, + { + "epoch": 0.3307709202723479, + "grad_norm": 2.2806482315063477, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.6987780332565308, + "num_tokens": 76525357.0, + "step": 3012 + }, + { + "epoch": 0.33088073797496154, + "grad_norm": 2.620342969894409, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7013574838638306, + "num_tokens": 76545262.0, + "step": 3013 + }, + { + "epoch": 0.33099055567757524, + "grad_norm": 2.4360527992248535, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.7033207416534424, + "num_tokens": 76566911.0, + "step": 3014 + }, + { + "epoch": 0.3311003733801889, + "grad_norm": 2.209322690963745, + "learning_rate": 1e-06, + "loss": 0.8532, + "mean_token_accuracy": 0.7338488698005676, + "num_tokens": 76589862.0, + "step": 3015 + }, + { + "epoch": 0.33121019108280253, + "grad_norm": 2.2429492473602295, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 
0.6960464715957642, + "num_tokens": 76616151.0, + "step": 3016 + }, + { + "epoch": 0.33132000878541623, + "grad_norm": 2.4618935585021973, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7234309911727905, + "num_tokens": 76638768.0, + "step": 3017 + }, + { + "epoch": 0.3314298264880299, + "grad_norm": 2.2880637645721436, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.7018468379974365, + "num_tokens": 76664245.0, + "step": 3018 + }, + { + "epoch": 0.3315396441906435, + "grad_norm": 2.153639554977417, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7114920616149902, + "num_tokens": 76691930.0, + "step": 3019 + }, + { + "epoch": 0.33164946189325717, + "grad_norm": 2.2456159591674805, + "learning_rate": 1e-06, + "loss": 1.0298, + "mean_token_accuracy": 0.6930941343307495, + "num_tokens": 76719771.0, + "step": 3020 + }, + { + "epoch": 0.33175927959587087, + "grad_norm": 2.1472134590148926, + "learning_rate": 1e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.6929610967636108, + "num_tokens": 76747411.0, + "step": 3021 + }, + { + "epoch": 0.3318690972984845, + "grad_norm": 2.255894422531128, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7114275097846985, + "num_tokens": 76772576.0, + "step": 3022 + }, + { + "epoch": 0.33197891500109816, + "grad_norm": 2.296245574951172, + "learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.6882835626602173, + "num_tokens": 76798005.0, + "step": 3023 + }, + { + "epoch": 0.33208873270371186, + "grad_norm": 2.3623716831207275, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7040001153945923, + "num_tokens": 76823530.0, + "step": 3024 + }, + { + "epoch": 0.3321985504063255, + "grad_norm": 2.1904776096343994, + "learning_rate": 1e-06, + "loss": 1.058, + "mean_token_accuracy": 0.6844162940979004, + "num_tokens": 76851873.0, + "step": 3025 + }, + { + "epoch": 0.33230836810893916, + "grad_norm": 
2.0464515686035156, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7014604806900024, + "num_tokens": 76878714.0, + "step": 3026 + }, + { + "epoch": 0.3324181858115528, + "grad_norm": 2.253268241882324, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.6990002989768982, + "num_tokens": 76904201.0, + "step": 3027 + }, + { + "epoch": 0.3325280035141665, + "grad_norm": 2.154352903366089, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7097886204719543, + "num_tokens": 76930932.0, + "step": 3028 + }, + { + "epoch": 0.33263782121678015, + "grad_norm": 1.8854035139083862, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.702495813369751, + "num_tokens": 76964866.0, + "step": 3029 + }, + { + "epoch": 0.3327476389193938, + "grad_norm": 2.2269985675811768, + "learning_rate": 1e-06, + "loss": 1.0623, + "mean_token_accuracy": 0.6875736713409424, + "num_tokens": 76992083.0, + "step": 3030 + }, + { + "epoch": 0.33285745662200744, + "grad_norm": 2.3026540279388428, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.7000293135643005, + "num_tokens": 77015565.0, + "step": 3031 + }, + { + "epoch": 0.33296727432462114, + "grad_norm": 2.1660561561584473, + "learning_rate": 1e-06, + "loss": 1.0463, + "mean_token_accuracy": 0.6840284466743469, + "num_tokens": 77043145.0, + "step": 3032 + }, + { + "epoch": 0.3330770920272348, + "grad_norm": 2.0512712001800537, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.7009378671646118, + "num_tokens": 77073394.0, + "step": 3033 + }, + { + "epoch": 0.33318690972984844, + "grad_norm": 2.314242124557495, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.6986080408096313, + "num_tokens": 77098597.0, + "step": 3034 + }, + { + "epoch": 0.33329672743246214, + "grad_norm": 2.0830609798431396, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.6953683495521545, + "num_tokens": 77126667.0, 
+ "step": 3035 + }, + { + "epoch": 0.3334065451350758, + "grad_norm": 2.1053972244262695, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7103772163391113, + "num_tokens": 77154977.0, + "step": 3036 + }, + { + "epoch": 0.33351636283768943, + "grad_norm": 2.2665088176727295, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7042458653450012, + "num_tokens": 77179684.0, + "step": 3037 + }, + { + "epoch": 0.3336261805403031, + "grad_norm": 2.436356544494629, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.7002869844436646, + "num_tokens": 77202710.0, + "step": 3038 + }, + { + "epoch": 0.3337359982429168, + "grad_norm": 2.4834911823272705, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.6979823708534241, + "num_tokens": 77224163.0, + "step": 3039 + }, + { + "epoch": 0.3338458159455304, + "grad_norm": 2.1182897090911865, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7017713189125061, + "num_tokens": 77254357.0, + "step": 3040 + }, + { + "epoch": 0.33395563364814407, + "grad_norm": 1.9681333303451538, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 0.6995910406112671, + "num_tokens": 77286224.0, + "step": 3041 + }, + { + "epoch": 0.33406545135075777, + "grad_norm": 2.0785655975341797, + "learning_rate": 1e-06, + "loss": 1.0985, + "mean_token_accuracy": 0.6699164509773254, + "num_tokens": 77314446.0, + "step": 3042 + }, + { + "epoch": 0.3341752690533714, + "grad_norm": 2.3323252201080322, + "learning_rate": 1e-06, + "loss": 1.0584, + "mean_token_accuracy": 0.6865000128746033, + "num_tokens": 77341596.0, + "step": 3043 + }, + { + "epoch": 0.33428508675598506, + "grad_norm": 2.007077693939209, + "learning_rate": 1e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.687399685382843, + "num_tokens": 77373522.0, + "step": 3044 + }, + { + "epoch": 0.3343949044585987, + "grad_norm": 2.3507134914398193, + "learning_rate": 1e-06, + "loss": 0.9897, + 
"mean_token_accuracy": 0.6975351572036743, + "num_tokens": 77396863.0, + "step": 3045 + }, + { + "epoch": 0.3345047221612124, + "grad_norm": 2.6495585441589355, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7034255266189575, + "num_tokens": 77416354.0, + "step": 3046 + }, + { + "epoch": 0.33461453986382605, + "grad_norm": 2.308190107345581, + "learning_rate": 1e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.6922033429145813, + "num_tokens": 77445483.0, + "step": 3047 + }, + { + "epoch": 0.3347243575664397, + "grad_norm": 2.4077935218811035, + "learning_rate": 1e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.7241799831390381, + "num_tokens": 77467812.0, + "step": 3048 + }, + { + "epoch": 0.33483417526905335, + "grad_norm": 2.2005531787872314, + "learning_rate": 1e-06, + "loss": 1.0541, + "mean_token_accuracy": 0.6774933934211731, + "num_tokens": 77492796.0, + "step": 3049 + }, + { + "epoch": 0.33494399297166705, + "grad_norm": 2.001870632171631, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.6885014772415161, + "num_tokens": 77523947.0, + "step": 3050 + }, + { + "epoch": 0.3350538106742807, + "grad_norm": 2.416459798812866, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7246174812316895, + "num_tokens": 77545473.0, + "step": 3051 + }, + { + "epoch": 0.33516362837689434, + "grad_norm": 2.31284236907959, + "learning_rate": 1e-06, + "loss": 1.0314, + "mean_token_accuracy": 0.6903645396232605, + "num_tokens": 77569215.0, + "step": 3052 + }, + { + "epoch": 0.33527344607950804, + "grad_norm": 2.109985589981079, + "learning_rate": 1e-06, + "loss": 1.0612, + "mean_token_accuracy": 0.687382698059082, + "num_tokens": 77597609.0, + "step": 3053 + }, + { + "epoch": 0.3353832637821217, + "grad_norm": 2.2928099632263184, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.6969040036201477, + "num_tokens": 77620780.0, + "step": 3054 + }, + { + "epoch": 0.33549308148473533, + 
"grad_norm": 2.138925313949585, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7158210873603821, + "num_tokens": 77646634.0, + "step": 3055 + }, + { + "epoch": 0.335602899187349, + "grad_norm": 2.4572253227233887, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7299551367759705, + "num_tokens": 77671166.0, + "step": 3056 + }, + { + "epoch": 0.3357127168899627, + "grad_norm": 2.0927894115448, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7129402756690979, + "num_tokens": 77698268.0, + "step": 3057 + }, + { + "epoch": 0.3358225345925763, + "grad_norm": 2.0249123573303223, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.6914882659912109, + "num_tokens": 77728436.0, + "step": 3058 + }, + { + "epoch": 0.33593235229519, + "grad_norm": 2.336686134338379, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7079216241836548, + "num_tokens": 77751660.0, + "step": 3059 + }, + { + "epoch": 0.3360421699978036, + "grad_norm": 2.229736804962158, + "learning_rate": 1e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.6917436122894287, + "num_tokens": 77776347.0, + "step": 3060 + }, + { + "epoch": 0.3361519877004173, + "grad_norm": 2.541215658187866, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7168881893157959, + "num_tokens": 77797942.0, + "step": 3061 + }, + { + "epoch": 0.33626180540303097, + "grad_norm": 1.9880365133285522, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7063150405883789, + "num_tokens": 77827349.0, + "step": 3062 + }, + { + "epoch": 0.3363716231056446, + "grad_norm": 2.373934745788574, + "learning_rate": 1e-06, + "loss": 1.0577, + "mean_token_accuracy": 0.682866632938385, + "num_tokens": 77850981.0, + "step": 3063 + }, + { + "epoch": 0.3364814408082583, + "grad_norm": 2.292529344558716, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.6856207847595215, + "num_tokens": 77875371.0, 
+ "step": 3064 + }, + { + "epoch": 0.33659125851087196, + "grad_norm": 1.9303431510925293, + "learning_rate": 1e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.6836824417114258, + "num_tokens": 77906166.0, + "step": 3065 + }, + { + "epoch": 0.3367010762134856, + "grad_norm": 2.315173387527466, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6957937479019165, + "num_tokens": 77936009.0, + "step": 3066 + }, + { + "epoch": 0.33681089391609925, + "grad_norm": 2.2288389205932617, + "learning_rate": 1e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.6872841119766235, + "num_tokens": 77960921.0, + "step": 3067 + }, + { + "epoch": 0.33692071161871295, + "grad_norm": 2.4282524585723877, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7094231247901917, + "num_tokens": 77982243.0, + "step": 3068 + }, + { + "epoch": 0.3370305293213266, + "grad_norm": 2.1781630516052246, + "learning_rate": 1e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7187603712081909, + "num_tokens": 78008896.0, + "step": 3069 + }, + { + "epoch": 0.33714034702394025, + "grad_norm": 2.295651435852051, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7262872457504272, + "num_tokens": 78031574.0, + "step": 3070 + }, + { + "epoch": 0.33725016472655395, + "grad_norm": 2.3599932193756104, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.6951069831848145, + "num_tokens": 78055668.0, + "step": 3071 + }, + { + "epoch": 0.3373599824291676, + "grad_norm": 2.1367011070251465, + "learning_rate": 1e-06, + "loss": 1.1029, + "mean_token_accuracy": 0.6685352921485901, + "num_tokens": 78085960.0, + "step": 3072 + }, + { + "epoch": 0.33746980013178124, + "grad_norm": 2.1525089740753174, + "learning_rate": 1e-06, + "loss": 1.0984, + "mean_token_accuracy": 0.678193211555481, + "num_tokens": 78113170.0, + "step": 3073 + }, + { + "epoch": 0.3375796178343949, + "grad_norm": 2.7072086334228516, + "learning_rate": 1e-06, + "loss": 0.8817, 
+ "mean_token_accuracy": 0.7195205688476562, + "num_tokens": 78133540.0, + "step": 3074 + }, + { + "epoch": 0.3376894355370086, + "grad_norm": 2.0142881870269775, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7188939452171326, + "num_tokens": 78162873.0, + "step": 3075 + }, + { + "epoch": 0.33779925323962223, + "grad_norm": 2.0846025943756104, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7031610012054443, + "num_tokens": 78189583.0, + "step": 3076 + }, + { + "epoch": 0.3379090709422359, + "grad_norm": 2.304591417312622, + "learning_rate": 1e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.6929634809494019, + "num_tokens": 78217920.0, + "step": 3077 + }, + { + "epoch": 0.3380188886448495, + "grad_norm": 2.2151780128479004, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.697076141834259, + "num_tokens": 78244644.0, + "step": 3078 + }, + { + "epoch": 0.3381287063474632, + "grad_norm": 2.308668613433838, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7181296944618225, + "num_tokens": 78266675.0, + "step": 3079 + }, + { + "epoch": 0.33823852405007687, + "grad_norm": 2.321681022644043, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7038134336471558, + "num_tokens": 78290594.0, + "step": 3080 + }, + { + "epoch": 0.3383483417526905, + "grad_norm": 2.2509522438049316, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.6950193643569946, + "num_tokens": 78316025.0, + "step": 3081 + }, + { + "epoch": 0.3384581594553042, + "grad_norm": 2.551565647125244, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.6936882138252258, + "num_tokens": 78336001.0, + "step": 3082 + }, + { + "epoch": 0.33856797715791787, + "grad_norm": 2.098647356033325, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7131535410881042, + "num_tokens": 78365768.0, + "step": 3083 + }, + { + "epoch": 0.3386777948605315, + 
"grad_norm": 2.00301456451416, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7072017192840576, + "num_tokens": 78395940.0, + "step": 3084 + }, + { + "epoch": 0.33878761256314516, + "grad_norm": 2.287262201309204, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.7047128677368164, + "num_tokens": 78418956.0, + "step": 3085 + }, + { + "epoch": 0.33889743026575886, + "grad_norm": 2.146291494369507, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.7006222009658813, + "num_tokens": 78446035.0, + "step": 3086 + }, + { + "epoch": 0.3390072479683725, + "grad_norm": 2.4490067958831787, + "learning_rate": 1e-06, + "loss": 1.0786, + "mean_token_accuracy": 0.6834028363227844, + "num_tokens": 78468674.0, + "step": 3087 + }, + { + "epoch": 0.33911706567098615, + "grad_norm": 2.2322137355804443, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7183877229690552, + "num_tokens": 78494327.0, + "step": 3088 + }, + { + "epoch": 0.3392268833735998, + "grad_norm": 2.2395174503326416, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7041308879852295, + "num_tokens": 78520463.0, + "step": 3089 + }, + { + "epoch": 0.3393367010762135, + "grad_norm": 2.4625232219696045, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.691620945930481, + "num_tokens": 78540747.0, + "step": 3090 + }, + { + "epoch": 0.33944651877882714, + "grad_norm": 2.252168893814087, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7149121761322021, + "num_tokens": 78566733.0, + "step": 3091 + }, + { + "epoch": 0.3395563364814408, + "grad_norm": 2.2491042613983154, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.6953563690185547, + "num_tokens": 78590242.0, + "step": 3092 + }, + { + "epoch": 0.3396661541840545, + "grad_norm": 2.2012441158294678, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.6938081383705139, + "num_tokens": 
78615871.0, + "step": 3093 + }, + { + "epoch": 0.33977597188666814, + "grad_norm": 2.09049129486084, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.7115762233734131, + "num_tokens": 78642888.0, + "step": 3094 + }, + { + "epoch": 0.3398857895892818, + "grad_norm": 2.397772789001465, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7067288756370544, + "num_tokens": 78663783.0, + "step": 3095 + }, + { + "epoch": 0.33999560729189543, + "grad_norm": 2.2270162105560303, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.703100323677063, + "num_tokens": 78689060.0, + "step": 3096 + }, + { + "epoch": 0.34010542499450913, + "grad_norm": 2.4856607913970947, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.6976507902145386, + "num_tokens": 78712900.0, + "step": 3097 + }, + { + "epoch": 0.3402152426971228, + "grad_norm": 2.4618160724639893, + "learning_rate": 1e-06, + "loss": 1.041, + "mean_token_accuracy": 0.6816050410270691, + "num_tokens": 78735444.0, + "step": 3098 + }, + { + "epoch": 0.3403250603997364, + "grad_norm": 1.9673084020614624, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7069017291069031, + "num_tokens": 78768846.0, + "step": 3099 + }, + { + "epoch": 0.3404348781023501, + "grad_norm": 2.012399196624756, + "learning_rate": 1e-06, + "loss": 1.1539, + "mean_token_accuracy": 0.6551059484481812, + "num_tokens": 78801129.0, + "step": 3100 + }, + { + "epoch": 0.34054469580496377, + "grad_norm": 2.314878225326538, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7137011885643005, + "num_tokens": 78824488.0, + "step": 3101 + }, + { + "epoch": 0.3406545135075774, + "grad_norm": 2.2991654872894287, + "learning_rate": 1e-06, + "loss": 1.0549, + "mean_token_accuracy": 0.6838697791099548, + "num_tokens": 78847800.0, + "step": 3102 + }, + { + "epoch": 0.34076433121019106, + "grad_norm": 2.268977165222168, + "learning_rate": 1e-06, + "loss": 
1.013, + "mean_token_accuracy": 0.6916420459747314, + "num_tokens": 78872811.0, + "step": 3103 + }, + { + "epoch": 0.34087414891280476, + "grad_norm": 2.233309507369995, + "learning_rate": 1e-06, + "loss": 1.008, + "mean_token_accuracy": 0.6972110271453857, + "num_tokens": 78896928.0, + "step": 3104 + }, + { + "epoch": 0.3409839666154184, + "grad_norm": 2.218409299850464, + "learning_rate": 1e-06, + "loss": 1.0239, + "mean_token_accuracy": 0.6932320594787598, + "num_tokens": 78922654.0, + "step": 3105 + }, + { + "epoch": 0.34109378431803206, + "grad_norm": 2.5543649196624756, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7420464754104614, + "num_tokens": 78941289.0, + "step": 3106 + }, + { + "epoch": 0.3412036020206457, + "grad_norm": 2.3112478256225586, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.7019516229629517, + "num_tokens": 78965602.0, + "step": 3107 + }, + { + "epoch": 0.3413134197232594, + "grad_norm": 2.360598564147949, + "learning_rate": 1e-06, + "loss": 1.0779, + "mean_token_accuracy": 0.6769358515739441, + "num_tokens": 78992318.0, + "step": 3108 + }, + { + "epoch": 0.34142323742587305, + "grad_norm": 2.2780609130859375, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.6904418468475342, + "num_tokens": 79018113.0, + "step": 3109 + }, + { + "epoch": 0.3415330551284867, + "grad_norm": 2.0050415992736816, + "learning_rate": 1e-06, + "loss": 1.1199, + "mean_token_accuracy": 0.6649218797683716, + "num_tokens": 79049652.0, + "step": 3110 + }, + { + "epoch": 0.3416428728311004, + "grad_norm": 2.3091135025024414, + "learning_rate": 1e-06, + "loss": 1.1051, + "mean_token_accuracy": 0.669671893119812, + "num_tokens": 79073498.0, + "step": 3111 + }, + { + "epoch": 0.34175269053371404, + "grad_norm": 2.354889392852783, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7202717065811157, + "num_tokens": 79094959.0, + "step": 3112 + }, + { + "epoch": 0.3418625082363277, + 
"grad_norm": 2.137766122817993, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.6872434616088867, + "num_tokens": 79123823.0, + "step": 3113 + }, + { + "epoch": 0.34197232593894134, + "grad_norm": 2.5702264308929443, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.724134087562561, + "num_tokens": 79142357.0, + "step": 3114 + }, + { + "epoch": 0.34208214364155504, + "grad_norm": 2.322232961654663, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7113080024719238, + "num_tokens": 79163436.0, + "step": 3115 + }, + { + "epoch": 0.3421919613441687, + "grad_norm": 2.0934324264526367, + "learning_rate": 1e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.6896845698356628, + "num_tokens": 79190579.0, + "step": 3116 + }, + { + "epoch": 0.34230177904678233, + "grad_norm": 2.0488734245300293, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.6879798173904419, + "num_tokens": 79218505.0, + "step": 3117 + }, + { + "epoch": 0.34241159674939603, + "grad_norm": 2.1613547801971436, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7088101506233215, + "num_tokens": 79243844.0, + "step": 3118 + }, + { + "epoch": 0.3425214144520097, + "grad_norm": 2.1501550674438477, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.6993228197097778, + "num_tokens": 79269160.0, + "step": 3119 + }, + { + "epoch": 0.3426312321546233, + "grad_norm": 2.1749842166900635, + "learning_rate": 1e-06, + "loss": 1.068, + "mean_token_accuracy": 0.6814588308334351, + "num_tokens": 79296113.0, + "step": 3120 + }, + { + "epoch": 0.34274104985723697, + "grad_norm": 1.9539765119552612, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7082120180130005, + "num_tokens": 79327063.0, + "step": 3121 + }, + { + "epoch": 0.34285086755985067, + "grad_norm": 2.3042120933532715, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.6970980167388916, + 
"num_tokens": 79352237.0, + "step": 3122 + }, + { + "epoch": 0.3429606852624643, + "grad_norm": 1.9534372091293335, + "learning_rate": 1e-06, + "loss": 1.0679, + "mean_token_accuracy": 0.6773341298103333, + "num_tokens": 79383746.0, + "step": 3123 + }, + { + "epoch": 0.34307050296507796, + "grad_norm": 2.376337766647339, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7077811360359192, + "num_tokens": 79407492.0, + "step": 3124 + }, + { + "epoch": 0.3431803206676916, + "grad_norm": 1.9865161180496216, + "learning_rate": 1e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.6947591304779053, + "num_tokens": 79439563.0, + "step": 3125 + }, + { + "epoch": 0.3432901383703053, + "grad_norm": 2.4005868434906006, + "learning_rate": 1e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.6976280212402344, + "num_tokens": 79463480.0, + "step": 3126 + }, + { + "epoch": 0.34339995607291895, + "grad_norm": 2.594115734100342, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7129951119422913, + "num_tokens": 79483870.0, + "step": 3127 + }, + { + "epoch": 0.3435097737755326, + "grad_norm": 2.1803431510925293, + "learning_rate": 1e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.6941640377044678, + "num_tokens": 79512874.0, + "step": 3128 + }, + { + "epoch": 0.3436195914781463, + "grad_norm": 2.535830020904541, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.7019790410995483, + "num_tokens": 79533191.0, + "step": 3129 + }, + { + "epoch": 0.34372940918075995, + "grad_norm": 2.057708263397217, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.6964488625526428, + "num_tokens": 79560685.0, + "step": 3130 + }, + { + "epoch": 0.3438392268833736, + "grad_norm": 2.429518222808838, + "learning_rate": 1e-06, + "loss": 1.0802, + "mean_token_accuracy": 0.6773127317428589, + "num_tokens": 79584122.0, + "step": 3131 + }, + { + "epoch": 0.34394904458598724, + "grad_norm": 2.287935495376587, + "learning_rate": 
1e-06, + "loss": 1.0617, + "mean_token_accuracy": 0.6844005584716797, + "num_tokens": 79610146.0, + "step": 3132 + }, + { + "epoch": 0.34405886228860094, + "grad_norm": 2.2848048210144043, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7026692032814026, + "num_tokens": 79633550.0, + "step": 3133 + }, + { + "epoch": 0.3441686799912146, + "grad_norm": 2.118650197982788, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7021934986114502, + "num_tokens": 79659977.0, + "step": 3134 + }, + { + "epoch": 0.34427849769382823, + "grad_norm": 2.486764669418335, + "learning_rate": 1e-06, + "loss": 0.8533, + "mean_token_accuracy": 0.729873538017273, + "num_tokens": 79679419.0, + "step": 3135 + }, + { + "epoch": 0.3443883153964419, + "grad_norm": 2.416076421737671, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.6950803995132446, + "num_tokens": 79702286.0, + "step": 3136 + }, + { + "epoch": 0.3444981330990556, + "grad_norm": 2.635883092880249, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7021350264549255, + "num_tokens": 79722678.0, + "step": 3137 + }, + { + "epoch": 0.3446079508016692, + "grad_norm": 2.408989667892456, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7118991613388062, + "num_tokens": 79743888.0, + "step": 3138 + }, + { + "epoch": 0.3447177685042829, + "grad_norm": 2.03444766998291, + "learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.6903760433197021, + "num_tokens": 79772209.0, + "step": 3139 + }, + { + "epoch": 0.3448275862068966, + "grad_norm": 2.0988845825195312, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.7053666114807129, + "num_tokens": 79800244.0, + "step": 3140 + }, + { + "epoch": 0.3449374039095102, + "grad_norm": 2.3460676670074463, + "learning_rate": 1e-06, + "loss": 1.042, + "mean_token_accuracy": 0.6895116567611694, + "num_tokens": 79824430.0, + "step": 3141 + }, + { + "epoch": 
0.34504722161212387, + "grad_norm": 2.1824381351470947, + "learning_rate": 1e-06, + "loss": 1.1081, + "mean_token_accuracy": 0.6650940775871277, + "num_tokens": 79852076.0, + "step": 3142 + }, + { + "epoch": 0.3451570393147375, + "grad_norm": 2.1076159477233887, + "learning_rate": 1e-06, + "loss": 1.0247, + "mean_token_accuracy": 0.6871548891067505, + "num_tokens": 79879072.0, + "step": 3143 + }, + { + "epoch": 0.3452668570173512, + "grad_norm": 2.31103253364563, + "learning_rate": 1e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7316389083862305, + "num_tokens": 79900744.0, + "step": 3144 + }, + { + "epoch": 0.34537667471996486, + "grad_norm": 2.4033889770507812, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7154610753059387, + "num_tokens": 79921893.0, + "step": 3145 + }, + { + "epoch": 0.3454864924225785, + "grad_norm": 2.2276062965393066, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7240235805511475, + "num_tokens": 79946677.0, + "step": 3146 + }, + { + "epoch": 0.3455963101251922, + "grad_norm": 2.1229779720306396, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7181680202484131, + "num_tokens": 79972739.0, + "step": 3147 + }, + { + "epoch": 0.34570612782780585, + "grad_norm": 2.509516477584839, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.733087420463562, + "num_tokens": 79992036.0, + "step": 3148 + }, + { + "epoch": 0.3458159455304195, + "grad_norm": 2.11769700050354, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.6972953081130981, + "num_tokens": 80019580.0, + "step": 3149 + }, + { + "epoch": 0.34592576323303315, + "grad_norm": 2.2303261756896973, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7033947706222534, + "num_tokens": 80044739.0, + "step": 3150 + }, + { + "epoch": 0.34603558093564685, + "grad_norm": 2.3776135444641113, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 
0.6986407041549683, + "num_tokens": 80068448.0, + "step": 3151 + }, + { + "epoch": 0.3461453986382605, + "grad_norm": 2.267744779586792, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7013756632804871, + "num_tokens": 80094480.0, + "step": 3152 + }, + { + "epoch": 0.34625521634087414, + "grad_norm": 1.98042631149292, + "learning_rate": 1e-06, + "loss": 1.0368, + "mean_token_accuracy": 0.6902992725372314, + "num_tokens": 80124988.0, + "step": 3153 + }, + { + "epoch": 0.3463650340434878, + "grad_norm": 2.399170160293579, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.717749834060669, + "num_tokens": 80147805.0, + "step": 3154 + }, + { + "epoch": 0.3464748517461015, + "grad_norm": 2.256514072418213, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7216647863388062, + "num_tokens": 80172766.0, + "step": 3155 + }, + { + "epoch": 0.34658466944871513, + "grad_norm": 2.2852513790130615, + "learning_rate": 1e-06, + "loss": 1.046, + "mean_token_accuracy": 0.6841272115707397, + "num_tokens": 80197467.0, + "step": 3156 + }, + { + "epoch": 0.3466944871513288, + "grad_norm": 2.016317367553711, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7142617106437683, + "num_tokens": 80226192.0, + "step": 3157 + }, + { + "epoch": 0.3468043048539425, + "grad_norm": 2.5484845638275146, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.6976252794265747, + "num_tokens": 80246530.0, + "step": 3158 + }, + { + "epoch": 0.3469141225565561, + "grad_norm": 2.033971071243286, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6909747123718262, + "num_tokens": 80277104.0, + "step": 3159 + }, + { + "epoch": 0.34702394025916977, + "grad_norm": 2.0345914363861084, + "learning_rate": 1e-06, + "loss": 1.062, + "mean_token_accuracy": 0.6818464398384094, + "num_tokens": 80307953.0, + "step": 3160 + }, + { + "epoch": 0.3471337579617834, + "grad_norm": 2.156999349594116, + 
"learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.7043671607971191, + "num_tokens": 80335528.0, + "step": 3161 + }, + { + "epoch": 0.3472435756643971, + "grad_norm": 2.1481387615203857, + "learning_rate": 1e-06, + "loss": 1.0969, + "mean_token_accuracy": 0.6742690205574036, + "num_tokens": 80366387.0, + "step": 3162 + }, + { + "epoch": 0.34735339336701077, + "grad_norm": 1.987593650817871, + "learning_rate": 1e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.6938250660896301, + "num_tokens": 80398498.0, + "step": 3163 + }, + { + "epoch": 0.3474632110696244, + "grad_norm": 2.2976768016815186, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7151904702186584, + "num_tokens": 80421535.0, + "step": 3164 + }, + { + "epoch": 0.34757302877223806, + "grad_norm": 1.9164471626281738, + "learning_rate": 1e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.6945843696594238, + "num_tokens": 80454459.0, + "step": 3165 + }, + { + "epoch": 0.34768284647485176, + "grad_norm": 2.111269235610962, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7069358229637146, + "num_tokens": 80480619.0, + "step": 3166 + }, + { + "epoch": 0.3477926641774654, + "grad_norm": 2.2196669578552246, + "learning_rate": 1e-06, + "loss": 1.043, + "mean_token_accuracy": 0.6860295534133911, + "num_tokens": 80508329.0, + "step": 3167 + }, + { + "epoch": 0.34790248188007905, + "grad_norm": 2.4751720428466797, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7152947783470154, + "num_tokens": 80529135.0, + "step": 3168 + }, + { + "epoch": 0.34801229958269275, + "grad_norm": 2.016153573989868, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7157827615737915, + "num_tokens": 80555866.0, + "step": 3169 + }, + { + "epoch": 0.3481221172853064, + "grad_norm": 2.6137893199920654, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7109411954879761, + "num_tokens": 80574791.0, + "step": 3170 + }, + 
{ + "epoch": 0.34823193498792004, + "grad_norm": 2.301424741744995, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7043020725250244, + "num_tokens": 80597006.0, + "step": 3171 + }, + { + "epoch": 0.3483417526905337, + "grad_norm": 2.3264663219451904, + "learning_rate": 1e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7127586007118225, + "num_tokens": 80620306.0, + "step": 3172 + }, + { + "epoch": 0.3484515703931474, + "grad_norm": 2.111618757247925, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7224089503288269, + "num_tokens": 80647293.0, + "step": 3173 + }, + { + "epoch": 0.34856138809576104, + "grad_norm": 2.446758508682251, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7029961347579956, + "num_tokens": 80667891.0, + "step": 3174 + }, + { + "epoch": 0.3486712057983747, + "grad_norm": 2.276120901107788, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7011868953704834, + "num_tokens": 80692381.0, + "step": 3175 + }, + { + "epoch": 0.3487810235009884, + "grad_norm": 2.2142226696014404, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7051170468330383, + "num_tokens": 80716579.0, + "step": 3176 + }, + { + "epoch": 0.34889084120360203, + "grad_norm": 2.422741174697876, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.6937460899353027, + "num_tokens": 80738224.0, + "step": 3177 + }, + { + "epoch": 0.3490006589062157, + "grad_norm": 2.141761064529419, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.699149489402771, + "num_tokens": 80763771.0, + "step": 3178 + }, + { + "epoch": 0.3491104766088293, + "grad_norm": 2.511030673980713, + "learning_rate": 1e-06, + "loss": 1.0392, + "mean_token_accuracy": 0.6835352778434753, + "num_tokens": 80785258.0, + "step": 3179 + }, + { + "epoch": 0.349220294311443, + "grad_norm": 2.3834900856018066, + "learning_rate": 1e-06, + "loss": 1.0519, + "mean_token_accuracy": 
0.6879094839096069, + "num_tokens": 80808541.0, + "step": 3180 + }, + { + "epoch": 0.34933011201405667, + "grad_norm": 2.427725076675415, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7029712200164795, + "num_tokens": 80830047.0, + "step": 3181 + }, + { + "epoch": 0.3494399297166703, + "grad_norm": 2.281247138977051, + "learning_rate": 1e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.6829993724822998, + "num_tokens": 80856118.0, + "step": 3182 + }, + { + "epoch": 0.34954974741928396, + "grad_norm": 2.236077070236206, + "learning_rate": 1e-06, + "loss": 1.038, + "mean_token_accuracy": 0.6919382810592651, + "num_tokens": 80882524.0, + "step": 3183 + }, + { + "epoch": 0.34965956512189766, + "grad_norm": 1.9240708351135254, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.6951355934143066, + "num_tokens": 80915164.0, + "step": 3184 + }, + { + "epoch": 0.3497693828245113, + "grad_norm": 2.3928894996643066, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.699317216873169, + "num_tokens": 80938832.0, + "step": 3185 + }, + { + "epoch": 0.34987920052712496, + "grad_norm": 2.5863196849823, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7095749378204346, + "num_tokens": 80958507.0, + "step": 3186 + }, + { + "epoch": 0.34998901822973866, + "grad_norm": 2.3276784420013428, + "learning_rate": 1e-06, + "loss": 1.0318, + "mean_token_accuracy": 0.7019269466400146, + "num_tokens": 80983228.0, + "step": 3187 + }, + { + "epoch": 0.3500988359323523, + "grad_norm": 2.4783966541290283, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7030727863311768, + "num_tokens": 81004940.0, + "step": 3188 + }, + { + "epoch": 0.35020865363496595, + "grad_norm": 2.2828807830810547, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7154530882835388, + "num_tokens": 81027397.0, + "step": 3189 + }, + { + "epoch": 0.3503184713375796, + "grad_norm": 2.753659963607788, + 
"learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7073884010314941, + "num_tokens": 81045868.0, + "step": 3190 + }, + { + "epoch": 0.3504282890401933, + "grad_norm": 2.117816686630249, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7044239044189453, + "num_tokens": 81072746.0, + "step": 3191 + }, + { + "epoch": 0.35053810674280694, + "grad_norm": 2.276160717010498, + "learning_rate": 1e-06, + "loss": 1.0418, + "mean_token_accuracy": 0.6938197016716003, + "num_tokens": 81096723.0, + "step": 3192 + }, + { + "epoch": 0.3506479244454206, + "grad_norm": 2.318152666091919, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7053367495536804, + "num_tokens": 81121471.0, + "step": 3193 + }, + { + "epoch": 0.3507577421480343, + "grad_norm": 2.3299965858459473, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.6984388828277588, + "num_tokens": 81146887.0, + "step": 3194 + }, + { + "epoch": 0.35086755985064794, + "grad_norm": 2.1873624324798584, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6923030614852905, + "num_tokens": 81175226.0, + "step": 3195 + }, + { + "epoch": 0.3509773775532616, + "grad_norm": 2.7200589179992676, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7259378433227539, + "num_tokens": 81192205.0, + "step": 3196 + }, + { + "epoch": 0.35108719525587523, + "grad_norm": 2.3778584003448486, + "learning_rate": 1e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.7014180421829224, + "num_tokens": 81214683.0, + "step": 3197 + }, + { + "epoch": 0.35119701295848893, + "grad_norm": 2.263698101043701, + "learning_rate": 1e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.6995276212692261, + "num_tokens": 81240276.0, + "step": 3198 + }, + { + "epoch": 0.3513068306611026, + "grad_norm": 2.1919734477996826, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6899881958961487, + "num_tokens": 81265422.0, + "step": 3199 + }, + 
{ + "epoch": 0.3514166483637162, + "grad_norm": 2.13041090965271, + "learning_rate": 1e-06, + "loss": 1.0557, + "mean_token_accuracy": 0.6895540952682495, + "num_tokens": 81292953.0, + "step": 3200 + }, + { + "epoch": 0.35152646606632987, + "grad_norm": 2.7391269207000732, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7167330980300903, + "num_tokens": 81311139.0, + "step": 3201 + }, + { + "epoch": 0.35163628376894357, + "grad_norm": 2.17063045501709, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7140051126480103, + "num_tokens": 81338737.0, + "step": 3202 + }, + { + "epoch": 0.3517461014715572, + "grad_norm": 2.1220180988311768, + "learning_rate": 1e-06, + "loss": 1.0257, + "mean_token_accuracy": 0.6855983734130859, + "num_tokens": 81366960.0, + "step": 3203 + }, + { + "epoch": 0.35185591917417086, + "grad_norm": 2.224569797515869, + "learning_rate": 1e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.6942594647407532, + "num_tokens": 81392922.0, + "step": 3204 + }, + { + "epoch": 0.35196573687678456, + "grad_norm": 2.310697317123413, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7095824480056763, + "num_tokens": 81417401.0, + "step": 3205 + }, + { + "epoch": 0.3520755545793982, + "grad_norm": 1.888899803161621, + "learning_rate": 1e-06, + "loss": 1.0768, + "mean_token_accuracy": 0.6795867681503296, + "num_tokens": 81453359.0, + "step": 3206 + }, + { + "epoch": 0.35218537228201185, + "grad_norm": 2.256739854812622, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.7047451734542847, + "num_tokens": 81477591.0, + "step": 3207 + }, + { + "epoch": 0.3522951899846255, + "grad_norm": 2.205622673034668, + "learning_rate": 1e-06, + "loss": 1.0353, + "mean_token_accuracy": 0.685883641242981, + "num_tokens": 81504610.0, + "step": 3208 + }, + { + "epoch": 0.3524050076872392, + "grad_norm": 1.9848414659500122, + "learning_rate": 1e-06, + "loss": 1.0783, + "mean_token_accuracy": 
0.6831356287002563, + "num_tokens": 81535340.0, + "step": 3209 + }, + { + "epoch": 0.35251482538985285, + "grad_norm": 2.35239315032959, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7222862243652344, + "num_tokens": 81555833.0, + "step": 3210 + }, + { + "epoch": 0.3526246430924665, + "grad_norm": 2.2868778705596924, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.701256513595581, + "num_tokens": 81579608.0, + "step": 3211 + }, + { + "epoch": 0.35273446079508014, + "grad_norm": 2.167703628540039, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7097868919372559, + "num_tokens": 81605085.0, + "step": 3212 + }, + { + "epoch": 0.35284427849769384, + "grad_norm": 1.9518855810165405, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.702634334564209, + "num_tokens": 81635437.0, + "step": 3213 + }, + { + "epoch": 0.3529540962003075, + "grad_norm": 2.326432704925537, + "learning_rate": 1e-06, + "loss": 1.0548, + "mean_token_accuracy": 0.6886903047561646, + "num_tokens": 81660202.0, + "step": 3214 + }, + { + "epoch": 0.35306391390292113, + "grad_norm": 2.3427796363830566, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.6942195296287537, + "num_tokens": 81683653.0, + "step": 3215 + }, + { + "epoch": 0.35317373160553484, + "grad_norm": 2.027587413787842, + "learning_rate": 1e-06, + "loss": 1.0517, + "mean_token_accuracy": 0.6824370622634888, + "num_tokens": 81715291.0, + "step": 3216 + }, + { + "epoch": 0.3532835493081485, + "grad_norm": 2.299875020980835, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7063708305358887, + "num_tokens": 81740561.0, + "step": 3217 + }, + { + "epoch": 0.3533933670107621, + "grad_norm": 2.284572124481201, + "learning_rate": 1e-06, + "loss": 1.0207, + "mean_token_accuracy": 0.6973637938499451, + "num_tokens": 81764778.0, + "step": 3218 + }, + { + "epoch": 0.3535031847133758, + "grad_norm": 2.2921128273010254, + 
"learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7002096772193909, + "num_tokens": 81787292.0, + "step": 3219 + }, + { + "epoch": 0.3536130024159895, + "grad_norm": 2.0316123962402344, + "learning_rate": 1e-06, + "loss": 1.0391, + "mean_token_accuracy": 0.6862842440605164, + "num_tokens": 81817426.0, + "step": 3220 + }, + { + "epoch": 0.3537228201186031, + "grad_norm": 2.298154830932617, + "learning_rate": 1e-06, + "loss": 1.054, + "mean_token_accuracy": 0.6898597478866577, + "num_tokens": 81842691.0, + "step": 3221 + }, + { + "epoch": 0.35383263782121677, + "grad_norm": 2.105320453643799, + "learning_rate": 1e-06, + "loss": 1.1084, + "mean_token_accuracy": 0.6685747504234314, + "num_tokens": 81872699.0, + "step": 3222 + }, + { + "epoch": 0.35394245552383047, + "grad_norm": 2.378634214401245, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.6968504786491394, + "num_tokens": 81894693.0, + "step": 3223 + }, + { + "epoch": 0.3540522732264441, + "grad_norm": 2.375980854034424, + "learning_rate": 1e-06, + "loss": 1.0732, + "mean_token_accuracy": 0.6782643795013428, + "num_tokens": 81919516.0, + "step": 3224 + }, + { + "epoch": 0.35416209092905776, + "grad_norm": 2.442351818084717, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7240480184555054, + "num_tokens": 81942031.0, + "step": 3225 + }, + { + "epoch": 0.3542719086316714, + "grad_norm": 2.4336299896240234, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7221705913543701, + "num_tokens": 81963239.0, + "step": 3226 + }, + { + "epoch": 0.3543817263342851, + "grad_norm": 2.014223098754883, + "learning_rate": 1e-06, + "loss": 1.0745, + "mean_token_accuracy": 0.6860429048538208, + "num_tokens": 81994784.0, + "step": 3227 + }, + { + "epoch": 0.35449154403689875, + "grad_norm": 2.3199188709259033, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7146744728088379, + "num_tokens": 82017782.0, + "step": 3228 + }, + { 
+ "epoch": 0.3546013617395124, + "grad_norm": 2.352726697921753, + "learning_rate": 1e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7401304841041565, + "num_tokens": 82038805.0, + "step": 3229 + }, + { + "epoch": 0.35471117944212605, + "grad_norm": 2.247652530670166, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.7073303461074829, + "num_tokens": 82062554.0, + "step": 3230 + }, + { + "epoch": 0.35482099714473975, + "grad_norm": 2.5785534381866455, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.700853705406189, + "num_tokens": 82084040.0, + "step": 3231 + }, + { + "epoch": 0.3549308148473534, + "grad_norm": 2.1485438346862793, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6855230331420898, + "num_tokens": 82113463.0, + "step": 3232 + }, + { + "epoch": 0.35504063254996704, + "grad_norm": 2.2723276615142822, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7238015532493591, + "num_tokens": 82138047.0, + "step": 3233 + }, + { + "epoch": 0.35515045025258074, + "grad_norm": 2.4822282791137695, + "learning_rate": 1e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.6947625279426575, + "num_tokens": 82161219.0, + "step": 3234 + }, + { + "epoch": 0.3552602679551944, + "grad_norm": 2.258585214614868, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.715041995048523, + "num_tokens": 82185679.0, + "step": 3235 + }, + { + "epoch": 0.35537008565780803, + "grad_norm": 2.1152899265289307, + "learning_rate": 1e-06, + "loss": 1.0631, + "mean_token_accuracy": 0.681179404258728, + "num_tokens": 82213343.0, + "step": 3236 + }, + { + "epoch": 0.3554799033604217, + "grad_norm": 2.219280481338501, + "learning_rate": 1e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.6980890035629272, + "num_tokens": 82241157.0, + "step": 3237 + }, + { + "epoch": 0.3555897210630354, + "grad_norm": 2.0950393676757812, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 
0.7211028337478638, + "num_tokens": 82267745.0, + "step": 3238 + }, + { + "epoch": 0.355699538765649, + "grad_norm": 2.3265955448150635, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7052320241928101, + "num_tokens": 82290689.0, + "step": 3239 + }, + { + "epoch": 0.35580935646826267, + "grad_norm": 2.019284725189209, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7114802598953247, + "num_tokens": 82321375.0, + "step": 3240 + }, + { + "epoch": 0.3559191741708763, + "grad_norm": 2.498149871826172, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7157071828842163, + "num_tokens": 82352885.0, + "step": 3241 + }, + { + "epoch": 0.35602899187349, + "grad_norm": 2.469827175140381, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7174443006515503, + "num_tokens": 82373782.0, + "step": 3242 + }, + { + "epoch": 0.35613880957610367, + "grad_norm": 2.4822568893432617, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7053799033164978, + "num_tokens": 82395796.0, + "step": 3243 + }, + { + "epoch": 0.3562486272787173, + "grad_norm": 2.49076771736145, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7006102800369263, + "num_tokens": 82417684.0, + "step": 3244 + }, + { + "epoch": 0.356358444981331, + "grad_norm": 2.0271799564361572, + "learning_rate": 1e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.6976944804191589, + "num_tokens": 82446557.0, + "step": 3245 + }, + { + "epoch": 0.35646826268394466, + "grad_norm": 2.605543613433838, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.7015701532363892, + "num_tokens": 82467561.0, + "step": 3246 + }, + { + "epoch": 0.3565780803865583, + "grad_norm": 2.038637399673462, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7123219966888428, + "num_tokens": 82497308.0, + "step": 3247 + }, + { + "epoch": 0.35668789808917195, + "grad_norm": 2.4218828678131104, + 
"learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7166763544082642, + "num_tokens": 82517712.0, + "step": 3248 + }, + { + "epoch": 0.35679771579178565, + "grad_norm": 2.0423085689544678, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7017416954040527, + "num_tokens": 82547926.0, + "step": 3249 + }, + { + "epoch": 0.3569075334943993, + "grad_norm": 2.2769365310668945, + "learning_rate": 1e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.6965180039405823, + "num_tokens": 82573961.0, + "step": 3250 + }, + { + "epoch": 0.35701735119701294, + "grad_norm": 2.5500378608703613, + "learning_rate": 1e-06, + "loss": 1.041, + "mean_token_accuracy": 0.6928762197494507, + "num_tokens": 82594650.0, + "step": 3251 + }, + { + "epoch": 0.35712716889962665, + "grad_norm": 1.9700959920883179, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7148059606552124, + "num_tokens": 82624484.0, + "step": 3252 + }, + { + "epoch": 0.3572369866022403, + "grad_norm": 2.225813388824463, + "learning_rate": 1e-06, + "loss": 1.0251, + "mean_token_accuracy": 0.696105420589447, + "num_tokens": 82649015.0, + "step": 3253 + }, + { + "epoch": 0.35734680430485394, + "grad_norm": 2.6339282989501953, + "learning_rate": 1e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.6910824775695801, + "num_tokens": 82671758.0, + "step": 3254 + }, + { + "epoch": 0.3574566220074676, + "grad_norm": 2.3420825004577637, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.7010843753814697, + "num_tokens": 82698213.0, + "step": 3255 + }, + { + "epoch": 0.3575664397100813, + "grad_norm": 2.260603189468384, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.705048680305481, + "num_tokens": 82724044.0, + "step": 3256 + }, + { + "epoch": 0.35767625741269493, + "grad_norm": 2.1570191383361816, + "learning_rate": 1e-06, + "loss": 1.0539, + "mean_token_accuracy": 0.6932312846183777, + "num_tokens": 82752858.0, + "step": 3257 + }, + 
{ + "epoch": 0.3577860751153086, + "grad_norm": 2.261559009552002, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.6980496048927307, + "num_tokens": 82777100.0, + "step": 3258 + }, + { + "epoch": 0.3578958928179222, + "grad_norm": 2.567789077758789, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7087270617485046, + "num_tokens": 82796900.0, + "step": 3259 + }, + { + "epoch": 0.3580057105205359, + "grad_norm": 2.011579751968384, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7124876976013184, + "num_tokens": 82826217.0, + "step": 3260 + }, + { + "epoch": 0.35811552822314957, + "grad_norm": 2.4034500122070312, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.704128623008728, + "num_tokens": 82847924.0, + "step": 3261 + }, + { + "epoch": 0.3582253459257632, + "grad_norm": 2.269589900970459, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.6980404853820801, + "num_tokens": 82872739.0, + "step": 3262 + }, + { + "epoch": 0.3583351636283769, + "grad_norm": 2.3377089500427246, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.7005269527435303, + "num_tokens": 82897222.0, + "step": 3263 + }, + { + "epoch": 0.35844498133099056, + "grad_norm": 2.5599212646484375, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7181057929992676, + "num_tokens": 82918570.0, + "step": 3264 + }, + { + "epoch": 0.3585547990336042, + "grad_norm": 2.1958489418029785, + "learning_rate": 1e-06, + "loss": 1.0541, + "mean_token_accuracy": 0.6800801753997803, + "num_tokens": 82945401.0, + "step": 3265 + }, + { + "epoch": 0.35866461673621786, + "grad_norm": 2.114978790283203, + "learning_rate": 1e-06, + "loss": 1.089, + "mean_token_accuracy": 0.6778510808944702, + "num_tokens": 82975769.0, + "step": 3266 + }, + { + "epoch": 0.35877443443883156, + "grad_norm": 2.4773783683776855, + "learning_rate": 1e-06, + "loss": 1.0423, + "mean_token_accuracy": 
0.6814885139465332, + "num_tokens": 82997644.0, + "step": 3267 + }, + { + "epoch": 0.3588842521414452, + "grad_norm": 2.201725959777832, + "learning_rate": 1e-06, + "loss": 1.0657, + "mean_token_accuracy": 0.678964376449585, + "num_tokens": 83026270.0, + "step": 3268 + }, + { + "epoch": 0.35899406984405885, + "grad_norm": 2.289151668548584, + "learning_rate": 1e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.7034438252449036, + "num_tokens": 83050529.0, + "step": 3269 + }, + { + "epoch": 0.35910388754667255, + "grad_norm": 2.0353965759277344, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.6986401081085205, + "num_tokens": 83080103.0, + "step": 3270 + }, + { + "epoch": 0.3592137052492862, + "grad_norm": 2.413635492324829, + "learning_rate": 1e-06, + "loss": 1.0162, + "mean_token_accuracy": 0.6954082250595093, + "num_tokens": 83103034.0, + "step": 3271 + }, + { + "epoch": 0.35932352295189984, + "grad_norm": 2.193441867828369, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.6933634281158447, + "num_tokens": 83127692.0, + "step": 3272 + }, + { + "epoch": 0.3594333406545135, + "grad_norm": 2.6466565132141113, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.7088083624839783, + "num_tokens": 83147054.0, + "step": 3273 + }, + { + "epoch": 0.3595431583571272, + "grad_norm": 2.155728340148926, + "learning_rate": 1e-06, + "loss": 1.0552, + "mean_token_accuracy": 0.6845659613609314, + "num_tokens": 83174074.0, + "step": 3274 + }, + { + "epoch": 0.35965297605974084, + "grad_norm": 2.461595058441162, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.6957963109016418, + "num_tokens": 83196861.0, + "step": 3275 + }, + { + "epoch": 0.3597627937623545, + "grad_norm": 2.3038787841796875, + "learning_rate": 1e-06, + "loss": 1.0598, + "mean_token_accuracy": 0.681470513343811, + "num_tokens": 83223193.0, + "step": 3276 + }, + { + "epoch": 0.35987261146496813, + "grad_norm": 2.46063494682312, + 
"learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.6944134831428528, + "num_tokens": 83244791.0, + "step": 3277 + }, + { + "epoch": 0.35998242916758183, + "grad_norm": 2.0827956199645996, + "learning_rate": 1e-06, + "loss": 1.0414, + "mean_token_accuracy": 0.6913650035858154, + "num_tokens": 83272677.0, + "step": 3278 + }, + { + "epoch": 0.3600922468701955, + "grad_norm": 2.1803383827209473, + "learning_rate": 1e-06, + "loss": 1.0964, + "mean_token_accuracy": 0.6795794367790222, + "num_tokens": 83300943.0, + "step": 3279 + }, + { + "epoch": 0.3602020645728091, + "grad_norm": 2.705963611602783, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7069694399833679, + "num_tokens": 83318819.0, + "step": 3280 + }, + { + "epoch": 0.3603118822754228, + "grad_norm": 2.3818721771240234, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7080847024917603, + "num_tokens": 83341049.0, + "step": 3281 + }, + { + "epoch": 0.36042169997803647, + "grad_norm": 2.4309513568878174, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7104448676109314, + "num_tokens": 83363253.0, + "step": 3282 + }, + { + "epoch": 0.3605315176806501, + "grad_norm": 2.1808457374572754, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7180209159851074, + "num_tokens": 83388285.0, + "step": 3283 + }, + { + "epoch": 0.36064133538326376, + "grad_norm": 2.122774600982666, + "learning_rate": 1e-06, + "loss": 1.112, + "mean_token_accuracy": 0.6676037311553955, + "num_tokens": 83417683.0, + "step": 3284 + }, + { + "epoch": 0.36075115308587746, + "grad_norm": 2.5536372661590576, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7220999002456665, + "num_tokens": 83437400.0, + "step": 3285 + }, + { + "epoch": 0.3608609707884911, + "grad_norm": 2.593005657196045, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7082213163375854, + "num_tokens": 83456704.0, + "step": 3286 + }, + 
{ + "epoch": 0.36097078849110475, + "grad_norm": 2.7388010025024414, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7086843252182007, + "num_tokens": 83473995.0, + "step": 3287 + }, + { + "epoch": 0.3610806061937184, + "grad_norm": 1.9383703470230103, + "learning_rate": 1e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.683460533618927, + "num_tokens": 83504631.0, + "step": 3288 + }, + { + "epoch": 0.3611904238963321, + "grad_norm": 2.2015023231506348, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7099372744560242, + "num_tokens": 83529528.0, + "step": 3289 + }, + { + "epoch": 0.36130024159894575, + "grad_norm": 2.333493947982788, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7307994365692139, + "num_tokens": 83550586.0, + "step": 3290 + }, + { + "epoch": 0.3614100593015594, + "grad_norm": 2.167330503463745, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7073225378990173, + "num_tokens": 83578037.0, + "step": 3291 + }, + { + "epoch": 0.3615198770041731, + "grad_norm": 2.2756857872009277, + "learning_rate": 1e-06, + "loss": 1.0675, + "mean_token_accuracy": 0.6776852607727051, + "num_tokens": 83603926.0, + "step": 3292 + }, + { + "epoch": 0.36162969470678674, + "grad_norm": 2.4124433994293213, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7064822316169739, + "num_tokens": 83628563.0, + "step": 3293 + }, + { + "epoch": 0.3617395124094004, + "grad_norm": 2.19004487991333, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7006681561470032, + "num_tokens": 83656453.0, + "step": 3294 + }, + { + "epoch": 0.36184933011201403, + "grad_norm": 2.2713961601257324, + "learning_rate": 1e-06, + "loss": 1.1106, + "mean_token_accuracy": 0.6708185076713562, + "num_tokens": 83685542.0, + "step": 3295 + }, + { + "epoch": 0.36195914781462774, + "grad_norm": 2.061525821685791, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 
0.7034454345703125, + "num_tokens": 83715817.0, + "step": 3296 + }, + { + "epoch": 0.3620689655172414, + "grad_norm": 1.9088022708892822, + "learning_rate": 1e-06, + "loss": 1.0605, + "mean_token_accuracy": 0.6818997859954834, + "num_tokens": 83752258.0, + "step": 3297 + }, + { + "epoch": 0.362178783219855, + "grad_norm": 2.217294454574585, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7090559005737305, + "num_tokens": 83776872.0, + "step": 3298 + }, + { + "epoch": 0.36228860092246873, + "grad_norm": 1.8679980039596558, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6897291541099548, + "num_tokens": 83810493.0, + "step": 3299 + }, + { + "epoch": 0.3623984186250824, + "grad_norm": 2.1635172367095947, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7003843784332275, + "num_tokens": 83836516.0, + "step": 3300 + }, + { + "epoch": 0.362508236327696, + "grad_norm": 2.206789255142212, + "learning_rate": 1e-06, + "loss": 1.0941, + "mean_token_accuracy": 0.677564263343811, + "num_tokens": 83867356.0, + "step": 3301 + }, + { + "epoch": 0.36261805403030967, + "grad_norm": 2.6045310497283936, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7250288724899292, + "num_tokens": 83887017.0, + "step": 3302 + }, + { + "epoch": 0.36272787173292337, + "grad_norm": 2.258450508117676, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7165969610214233, + "num_tokens": 83911727.0, + "step": 3303 + }, + { + "epoch": 0.362837689435537, + "grad_norm": 2.2865653038024902, + "learning_rate": 1e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.7074933052062988, + "num_tokens": 83937310.0, + "step": 3304 + }, + { + "epoch": 0.36294750713815066, + "grad_norm": 2.238830089569092, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7168010473251343, + "num_tokens": 83960065.0, + "step": 3305 + }, + { + "epoch": 0.3630573248407643, + "grad_norm": 2.2847909927368164, + 
"learning_rate": 1e-06, + "loss": 1.0531, + "mean_token_accuracy": 0.6952176094055176, + "num_tokens": 83984989.0, + "step": 3306 + }, + { + "epoch": 0.363167142543378, + "grad_norm": 2.613069772720337, + "learning_rate": 1e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.6966038942337036, + "num_tokens": 84008598.0, + "step": 3307 + }, + { + "epoch": 0.36327696024599165, + "grad_norm": 2.4545443058013916, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.7013373970985413, + "num_tokens": 84032161.0, + "step": 3308 + }, + { + "epoch": 0.3633867779486053, + "grad_norm": 2.523820638656616, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6915360689163208, + "num_tokens": 84052829.0, + "step": 3309 + }, + { + "epoch": 0.363496595651219, + "grad_norm": 2.2671878337860107, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.6985273361206055, + "num_tokens": 84076460.0, + "step": 3310 + }, + { + "epoch": 0.36360641335383265, + "grad_norm": 2.248872995376587, + "learning_rate": 1e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.6970772743225098, + "num_tokens": 84100341.0, + "step": 3311 + }, + { + "epoch": 0.3637162310564463, + "grad_norm": 2.204876661300659, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7085462808609009, + "num_tokens": 84127718.0, + "step": 3312 + }, + { + "epoch": 0.36382604875905994, + "grad_norm": 2.175124406814575, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.6989035606384277, + "num_tokens": 84155784.0, + "step": 3313 + }, + { + "epoch": 0.36393586646167364, + "grad_norm": 2.024604320526123, + "learning_rate": 1e-06, + "loss": 1.0662, + "mean_token_accuracy": 0.6897903680801392, + "num_tokens": 84187944.0, + "step": 3314 + }, + { + "epoch": 0.3640456841642873, + "grad_norm": 1.9571645259857178, + "learning_rate": 1e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.6875877380371094, + "num_tokens": 84219444.0, + "step": 3315 + }, + { + 
"epoch": 0.36415550186690093, + "grad_norm": 2.3873279094696045, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7041814923286438, + "num_tokens": 84241786.0, + "step": 3316 + }, + { + "epoch": 0.3642653195695146, + "grad_norm": 2.3283631801605225, + "learning_rate": 1e-06, + "loss": 1.0913, + "mean_token_accuracy": 0.6734361052513123, + "num_tokens": 84267333.0, + "step": 3317 + }, + { + "epoch": 0.3643751372721283, + "grad_norm": 2.0276784896850586, + "learning_rate": 1e-06, + "loss": 1.0736, + "mean_token_accuracy": 0.6828486919403076, + "num_tokens": 84296604.0, + "step": 3318 + }, + { + "epoch": 0.3644849549747419, + "grad_norm": 1.974708080291748, + "learning_rate": 1e-06, + "loss": 1.0565, + "mean_token_accuracy": 0.6815140247344971, + "num_tokens": 84327590.0, + "step": 3319 + }, + { + "epoch": 0.36459477267735557, + "grad_norm": 2.33728289604187, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7257368564605713, + "num_tokens": 84351332.0, + "step": 3320 + }, + { + "epoch": 0.3647045903799693, + "grad_norm": 2.2558891773223877, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7152738571166992, + "num_tokens": 84377278.0, + "step": 3321 + }, + { + "epoch": 0.3648144080825829, + "grad_norm": 1.9775493144989014, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.6970060467720032, + "num_tokens": 84409961.0, + "step": 3322 + }, + { + "epoch": 0.36492422578519657, + "grad_norm": 2.4712021350860596, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7046008110046387, + "num_tokens": 84430900.0, + "step": 3323 + }, + { + "epoch": 0.3650340434878102, + "grad_norm": 2.2342567443847656, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7087163925170898, + "num_tokens": 84456819.0, + "step": 3324 + }, + { + "epoch": 0.3651438611904239, + "grad_norm": 2.1118149757385254, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 
0.7098259925842285, + "num_tokens": 84483719.0, + "step": 3325 + }, + { + "epoch": 0.36525367889303756, + "grad_norm": 2.5396933555603027, + "learning_rate": 1e-06, + "loss": 1.0153, + "mean_token_accuracy": 0.6947404146194458, + "num_tokens": 84504855.0, + "step": 3326 + }, + { + "epoch": 0.3653634965956512, + "grad_norm": 2.5154428482055664, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7060067653656006, + "num_tokens": 84525871.0, + "step": 3327 + }, + { + "epoch": 0.3654733142982649, + "grad_norm": 2.2123947143554688, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7117373943328857, + "num_tokens": 84552983.0, + "step": 3328 + }, + { + "epoch": 0.36558313200087855, + "grad_norm": 2.4161829948425293, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7090708017349243, + "num_tokens": 84576196.0, + "step": 3329 + }, + { + "epoch": 0.3656929497034922, + "grad_norm": 1.9697644710540771, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7067199945449829, + "num_tokens": 84607598.0, + "step": 3330 + }, + { + "epoch": 0.36580276740610584, + "grad_norm": 2.0769758224487305, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.6946194171905518, + "num_tokens": 84636213.0, + "step": 3331 + }, + { + "epoch": 0.36591258510871955, + "grad_norm": 2.5930535793304443, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7164998054504395, + "num_tokens": 84654488.0, + "step": 3332 + }, + { + "epoch": 0.3660224028113332, + "grad_norm": 2.682864189147949, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.6943430304527283, + "num_tokens": 84674309.0, + "step": 3333 + }, + { + "epoch": 0.36613222051394684, + "grad_norm": 1.9882144927978516, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7154017686843872, + "num_tokens": 84704032.0, + "step": 3334 + }, + { + "epoch": 0.3662420382165605, + "grad_norm": 
2.1317813396453857, + "learning_rate": 1e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.7299294471740723, + "num_tokens": 84729154.0, + "step": 3335 + }, + { + "epoch": 0.3663518559191742, + "grad_norm": 2.344303607940674, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.6989192366600037, + "num_tokens": 84753428.0, + "step": 3336 + }, + { + "epoch": 0.36646167362178783, + "grad_norm": 2.269092321395874, + "learning_rate": 1e-06, + "loss": 1.084, + "mean_token_accuracy": 0.6796432733535767, + "num_tokens": 84781336.0, + "step": 3337 + }, + { + "epoch": 0.3665714913244015, + "grad_norm": 2.1841225624084473, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7046673893928528, + "num_tokens": 84806694.0, + "step": 3338 + }, + { + "epoch": 0.3666813090270152, + "grad_norm": 2.2489023208618164, + "learning_rate": 1e-06, + "loss": 1.0108, + "mean_token_accuracy": 0.6962323784828186, + "num_tokens": 84831215.0, + "step": 3339 + }, + { + "epoch": 0.3667911267296288, + "grad_norm": 2.387308359146118, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.7041528224945068, + "num_tokens": 84853917.0, + "step": 3340 + }, + { + "epoch": 0.36690094443224247, + "grad_norm": 2.313650608062744, + "learning_rate": 1e-06, + "loss": 1.0475, + "mean_token_accuracy": 0.69474196434021, + "num_tokens": 84878430.0, + "step": 3341 + }, + { + "epoch": 0.3670107621348561, + "grad_norm": 2.0300254821777344, + "learning_rate": 1e-06, + "loss": 1.0516, + "mean_token_accuracy": 0.689593493938446, + "num_tokens": 84907909.0, + "step": 3342 + }, + { + "epoch": 0.3671205798374698, + "grad_norm": 2.8236470222473145, + "learning_rate": 1e-06, + "loss": 0.8138, + "mean_token_accuracy": 0.7463504076004028, + "num_tokens": 84924064.0, + "step": 3343 + }, + { + "epoch": 0.36723039754008346, + "grad_norm": 2.4058427810668945, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.6970162987709045, + "num_tokens": 84946416.0, + 
"step": 3344 + }, + { + "epoch": 0.3673402152426971, + "grad_norm": 2.1993250846862793, + "learning_rate": 1e-06, + "loss": 1.0857, + "mean_token_accuracy": 0.6722151041030884, + "num_tokens": 84974204.0, + "step": 3345 + }, + { + "epoch": 0.3674500329453108, + "grad_norm": 2.0580339431762695, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.6993533372879028, + "num_tokens": 85002457.0, + "step": 3346 + }, + { + "epoch": 0.36755985064792446, + "grad_norm": 2.2507550716400146, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7038357853889465, + "num_tokens": 85029267.0, + "step": 3347 + }, + { + "epoch": 0.3676696683505381, + "grad_norm": 2.6289236545562744, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7072319984436035, + "num_tokens": 85050744.0, + "step": 3348 + }, + { + "epoch": 0.36777948605315175, + "grad_norm": 2.5803651809692383, + "learning_rate": 1e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.6883610486984253, + "num_tokens": 85071472.0, + "step": 3349 + }, + { + "epoch": 0.36788930375576545, + "grad_norm": 2.4949707984924316, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7039718627929688, + "num_tokens": 85093234.0, + "step": 3350 + }, + { + "epoch": 0.3679991214583791, + "grad_norm": 2.293036699295044, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.6986710429191589, + "num_tokens": 85117302.0, + "step": 3351 + }, + { + "epoch": 0.36810893916099274, + "grad_norm": 2.3119914531707764, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7087141275405884, + "num_tokens": 85142136.0, + "step": 3352 + }, + { + "epoch": 0.3682187568636064, + "grad_norm": 2.330040216445923, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7107589244842529, + "num_tokens": 85165886.0, + "step": 3353 + }, + { + "epoch": 0.3683285745662201, + "grad_norm": 2.838106632232666, + "learning_rate": 1e-06, + "loss": 1.0048, + 
"mean_token_accuracy": 0.6906695365905762, + "num_tokens": 85195289.0, + "step": 3354 + }, + { + "epoch": 0.36843839226883374, + "grad_norm": 2.3800811767578125, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7040877342224121, + "num_tokens": 85218695.0, + "step": 3355 + }, + { + "epoch": 0.3685482099714474, + "grad_norm": 2.1412599086761475, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.706463634967804, + "num_tokens": 85247979.0, + "step": 3356 + }, + { + "epoch": 0.3686580276740611, + "grad_norm": 2.770172595977783, + "learning_rate": 1e-06, + "loss": 1.1146, + "mean_token_accuracy": 0.6711621880531311, + "num_tokens": 85277340.0, + "step": 3357 + }, + { + "epoch": 0.36876784537667473, + "grad_norm": 2.898930549621582, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7115799188613892, + "num_tokens": 85304323.0, + "step": 3358 + }, + { + "epoch": 0.3688776630792884, + "grad_norm": 2.5895705223083496, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6863667964935303, + "num_tokens": 85325267.0, + "step": 3359 + }, + { + "epoch": 0.368987480781902, + "grad_norm": 2.3313887119293213, + "learning_rate": 1e-06, + "loss": 1.0243, + "mean_token_accuracy": 0.6863020658493042, + "num_tokens": 85349814.0, + "step": 3360 + }, + { + "epoch": 0.3690972984845157, + "grad_norm": 2.4789388179779053, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7024675011634827, + "num_tokens": 85370625.0, + "step": 3361 + }, + { + "epoch": 0.36920711618712937, + "grad_norm": 2.0111589431762695, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.698826789855957, + "num_tokens": 85399888.0, + "step": 3362 + }, + { + "epoch": 0.369316933889743, + "grad_norm": 2.2159605026245117, + "learning_rate": 1e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6870702505111694, + "num_tokens": 85427408.0, + "step": 3363 + }, + { + "epoch": 0.36942675159235666, + 
"grad_norm": 2.493967294692993, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.6926422119140625, + "num_tokens": 85448361.0, + "step": 3364 + }, + { + "epoch": 0.36953656929497036, + "grad_norm": 2.4355387687683105, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.699236273765564, + "num_tokens": 85471880.0, + "step": 3365 + }, + { + "epoch": 0.369646386997584, + "grad_norm": 2.3130083084106445, + "learning_rate": 1e-06, + "loss": 1.0878, + "mean_token_accuracy": 0.6767439246177673, + "num_tokens": 85496920.0, + "step": 3366 + }, + { + "epoch": 0.36975620470019765, + "grad_norm": 2.1477091312408447, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7204210758209229, + "num_tokens": 85523970.0, + "step": 3367 + }, + { + "epoch": 0.36986602240281136, + "grad_norm": 2.074815273284912, + "learning_rate": 1e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.6943118572235107, + "num_tokens": 85554994.0, + "step": 3368 + }, + { + "epoch": 0.369975840105425, + "grad_norm": 2.3512508869171143, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7004708647727966, + "num_tokens": 85578399.0, + "step": 3369 + }, + { + "epoch": 0.37008565780803865, + "grad_norm": 2.1808021068573, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7008262872695923, + "num_tokens": 85604575.0, + "step": 3370 + }, + { + "epoch": 0.3701954755106523, + "grad_norm": 2.244866371154785, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7161604166030884, + "num_tokens": 85631892.0, + "step": 3371 + }, + { + "epoch": 0.370305293213266, + "grad_norm": 2.2011497020721436, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.7005487680435181, + "num_tokens": 85658337.0, + "step": 3372 + }, + { + "epoch": 0.37041511091587964, + "grad_norm": 2.113736867904663, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.6899617910385132, + "num_tokens": 
85687987.0, + "step": 3373 + }, + { + "epoch": 0.3705249286184933, + "grad_norm": 2.175976514816284, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.6985760927200317, + "num_tokens": 85715631.0, + "step": 3374 + }, + { + "epoch": 0.370634746321107, + "grad_norm": 2.222191572189331, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.701453447341919, + "num_tokens": 85741043.0, + "step": 3375 + }, + { + "epoch": 0.37074456402372064, + "grad_norm": 2.4796576499938965, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.7033097743988037, + "num_tokens": 85761873.0, + "step": 3376 + }, + { + "epoch": 0.3708543817263343, + "grad_norm": 2.0392818450927734, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7195576429367065, + "num_tokens": 85789737.0, + "step": 3377 + }, + { + "epoch": 0.3709641994289479, + "grad_norm": 2.3786368370056152, + "learning_rate": 1e-06, + "loss": 0.8714, + "mean_token_accuracy": 0.725100576877594, + "num_tokens": 85810360.0, + "step": 3378 + }, + { + "epoch": 0.37107401713156163, + "grad_norm": 2.4911444187164307, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7167391777038574, + "num_tokens": 85831541.0, + "step": 3379 + }, + { + "epoch": 0.3711838348341753, + "grad_norm": 1.9912433624267578, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.6942225098609924, + "num_tokens": 85862012.0, + "step": 3380 + }, + { + "epoch": 0.3712936525367889, + "grad_norm": 2.251246929168701, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.7013155221939087, + "num_tokens": 85887395.0, + "step": 3381 + }, + { + "epoch": 0.37140347023940257, + "grad_norm": 2.340092420578003, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7068905234336853, + "num_tokens": 85910812.0, + "step": 3382 + }, + { + "epoch": 0.37151328794201627, + "grad_norm": 2.0861716270446777, + "learning_rate": 1e-06, + "loss": 
0.7587, + "mean_token_accuracy": 0.7589354515075684, + "num_tokens": 85934706.0, + "step": 3383 + }, + { + "epoch": 0.3716231056446299, + "grad_norm": 2.179931402206421, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.6984719038009644, + "num_tokens": 85960372.0, + "step": 3384 + }, + { + "epoch": 0.37173292334724356, + "grad_norm": 2.4944396018981934, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.6948044300079346, + "num_tokens": 85983991.0, + "step": 3385 + }, + { + "epoch": 0.37184274104985726, + "grad_norm": 1.9621472358703613, + "learning_rate": 1e-06, + "loss": 1.0979, + "mean_token_accuracy": 0.6778242588043213, + "num_tokens": 86020130.0, + "step": 3386 + }, + { + "epoch": 0.3719525587524709, + "grad_norm": 2.5238804817199707, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7095355987548828, + "num_tokens": 86043992.0, + "step": 3387 + }, + { + "epoch": 0.37206237645508455, + "grad_norm": 2.669201374053955, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.6999132633209229, + "num_tokens": 86063341.0, + "step": 3388 + }, + { + "epoch": 0.3721721941576982, + "grad_norm": 2.4951910972595215, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7045131921768188, + "num_tokens": 86083900.0, + "step": 3389 + }, + { + "epoch": 0.3722820118603119, + "grad_norm": 2.192970037460327, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7055225372314453, + "num_tokens": 86108895.0, + "step": 3390 + }, + { + "epoch": 0.37239182956292555, + "grad_norm": 2.4119582176208496, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7088594436645508, + "num_tokens": 86131322.0, + "step": 3391 + }, + { + "epoch": 0.3725016472655392, + "grad_norm": 2.1525120735168457, + "learning_rate": 1e-06, + "loss": 1.0129, + "mean_token_accuracy": 0.690902590751648, + "num_tokens": 86160772.0, + "step": 3392 + }, + { + "epoch": 0.37261146496815284, 
+ "grad_norm": 2.3211445808410645, + "learning_rate": 1e-06, + "loss": 1.0956, + "mean_token_accuracy": 0.6777808666229248, + "num_tokens": 86184274.0, + "step": 3393 + }, + { + "epoch": 0.37272128267076654, + "grad_norm": 2.524458885192871, + "learning_rate": 1e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7151172757148743, + "num_tokens": 86203950.0, + "step": 3394 + }, + { + "epoch": 0.3728311003733802, + "grad_norm": 2.042111396789551, + "learning_rate": 1e-06, + "loss": 1.0935, + "mean_token_accuracy": 0.6746042966842651, + "num_tokens": 86234208.0, + "step": 3395 + }, + { + "epoch": 0.37294091807599383, + "grad_norm": 2.225893974304199, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.6975381374359131, + "num_tokens": 86259776.0, + "step": 3396 + }, + { + "epoch": 0.37305073577860753, + "grad_norm": 2.01839017868042, + "learning_rate": 1e-06, + "loss": 1.0246, + "mean_token_accuracy": 0.6914980411529541, + "num_tokens": 86289280.0, + "step": 3397 + }, + { + "epoch": 0.3731605534812212, + "grad_norm": 2.10223388671875, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7101060748100281, + "num_tokens": 86315196.0, + "step": 3398 + }, + { + "epoch": 0.3732703711838348, + "grad_norm": 2.2463021278381348, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7093073129653931, + "num_tokens": 86340103.0, + "step": 3399 + }, + { + "epoch": 0.37338018888644847, + "grad_norm": 2.1187918186187744, + "learning_rate": 1e-06, + "loss": 1.0375, + "mean_token_accuracy": 0.6879991292953491, + "num_tokens": 86369706.0, + "step": 3400 + }, + { + "epoch": 0.3734900065890622, + "grad_norm": 2.0158798694610596, + "learning_rate": 1e-06, + "loss": 1.0321, + "mean_token_accuracy": 0.7012673616409302, + "num_tokens": 86401371.0, + "step": 3401 + }, + { + "epoch": 0.3735998242916758, + "grad_norm": 2.350130558013916, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.73139488697052, + "num_tokens": 
86424483.0, + "step": 3402 + }, + { + "epoch": 0.37370964199428947, + "grad_norm": 2.146730899810791, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7199950218200684, + "num_tokens": 86448848.0, + "step": 3403 + }, + { + "epoch": 0.37381945969690317, + "grad_norm": 2.075010299682617, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.6957657337188721, + "num_tokens": 86478188.0, + "step": 3404 + }, + { + "epoch": 0.3739292773995168, + "grad_norm": 2.1652514934539795, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7196721434593201, + "num_tokens": 86506164.0, + "step": 3405 + }, + { + "epoch": 0.37403909510213046, + "grad_norm": 2.463015556335449, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7026246190071106, + "num_tokens": 86527782.0, + "step": 3406 + }, + { + "epoch": 0.3741489128047441, + "grad_norm": 2.1697998046875, + "learning_rate": 1e-06, + "loss": 1.0939, + "mean_token_accuracy": 0.6757839918136597, + "num_tokens": 86558385.0, + "step": 3407 + }, + { + "epoch": 0.3742587305073578, + "grad_norm": 2.019104480743408, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7005569934844971, + "num_tokens": 86588752.0, + "step": 3408 + }, + { + "epoch": 0.37436854820997145, + "grad_norm": 2.0866525173187256, + "learning_rate": 1e-06, + "loss": 1.0893, + "mean_token_accuracy": 0.6751994490623474, + "num_tokens": 86620758.0, + "step": 3409 + }, + { + "epoch": 0.3744783659125851, + "grad_norm": 1.9882469177246094, + "learning_rate": 1e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.6920115947723389, + "num_tokens": 86654848.0, + "step": 3410 + }, + { + "epoch": 0.37458818361519874, + "grad_norm": 2.5139217376708984, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.7018814086914062, + "num_tokens": 86675504.0, + "step": 3411 + }, + { + "epoch": 0.37469800131781245, + "grad_norm": 2.3702821731567383, + "learning_rate": 1e-06, + "loss": 
0.964, + "mean_token_accuracy": 0.7051401734352112, + "num_tokens": 86698737.0, + "step": 3412 + }, + { + "epoch": 0.3748078190204261, + "grad_norm": 2.0072150230407715, + "learning_rate": 1e-06, + "loss": 0.909, + "mean_token_accuracy": 0.7209345102310181, + "num_tokens": 86728059.0, + "step": 3413 + }, + { + "epoch": 0.37491763672303974, + "grad_norm": 1.9953744411468506, + "learning_rate": 1e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.7003364562988281, + "num_tokens": 86759067.0, + "step": 3414 + }, + { + "epoch": 0.37502745442565344, + "grad_norm": 2.275570869445801, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7158255577087402, + "num_tokens": 86783615.0, + "step": 3415 + }, + { + "epoch": 0.3751372721282671, + "grad_norm": 2.724000930786133, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.7012336850166321, + "num_tokens": 86804781.0, + "step": 3416 + }, + { + "epoch": 0.37524708983088073, + "grad_norm": 2.1959316730499268, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.716105043888092, + "num_tokens": 86828961.0, + "step": 3417 + }, + { + "epoch": 0.3753569075334944, + "grad_norm": 2.4633359909057617, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.6956071853637695, + "num_tokens": 86853709.0, + "step": 3418 + }, + { + "epoch": 0.3754667252361081, + "grad_norm": 2.4652812480926514, + "learning_rate": 1e-06, + "loss": 1.0739, + "mean_token_accuracy": 0.6848866939544678, + "num_tokens": 86877632.0, + "step": 3419 + }, + { + "epoch": 0.3755765429387217, + "grad_norm": 2.2853426933288574, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.7011694312095642, + "num_tokens": 86900852.0, + "step": 3420 + }, + { + "epoch": 0.37568636064133537, + "grad_norm": 2.197333812713623, + "learning_rate": 1e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.6882522106170654, + "num_tokens": 86926545.0, + "step": 3421 + }, + { + "epoch": 0.37579617834394907, + 
"grad_norm": 2.309244394302368, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7188270688056946, + "num_tokens": 86951382.0, + "step": 3422 + }, + { + "epoch": 0.3759059960465627, + "grad_norm": 2.0421454906463623, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7114339470863342, + "num_tokens": 86979936.0, + "step": 3423 + }, + { + "epoch": 0.37601581374917636, + "grad_norm": 1.9352784156799316, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7007703185081482, + "num_tokens": 87009488.0, + "step": 3424 + }, + { + "epoch": 0.37612563145179, + "grad_norm": 2.025359630584717, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6922661066055298, + "num_tokens": 87039950.0, + "step": 3425 + }, + { + "epoch": 0.3762354491544037, + "grad_norm": 2.2137668132781982, + "learning_rate": 1e-06, + "loss": 1.0465, + "mean_token_accuracy": 0.687161386013031, + "num_tokens": 87066645.0, + "step": 3426 + }, + { + "epoch": 0.37634526685701736, + "grad_norm": 2.3263821601867676, + "learning_rate": 1e-06, + "loss": 1.0417, + "mean_token_accuracy": 0.6912544369697571, + "num_tokens": 87091756.0, + "step": 3427 + }, + { + "epoch": 0.376455084559631, + "grad_norm": 2.1774990558624268, + "learning_rate": 1e-06, + "loss": 1.0376, + "mean_token_accuracy": 0.7018835544586182, + "num_tokens": 87118996.0, + "step": 3428 + }, + { + "epoch": 0.37656490226224465, + "grad_norm": 2.4053492546081543, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7210814356803894, + "num_tokens": 87140559.0, + "step": 3429 + }, + { + "epoch": 0.37667471996485835, + "grad_norm": 2.093771457672119, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.684651792049408, + "num_tokens": 87170601.0, + "step": 3430 + }, + { + "epoch": 0.376784537667472, + "grad_norm": 2.026738166809082, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.7122364044189453, + "num_tokens": 
87200458.0, + "step": 3431 + }, + { + "epoch": 0.37689435537008564, + "grad_norm": 2.5846800804138184, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7091977596282959, + "num_tokens": 87220357.0, + "step": 3432 + }, + { + "epoch": 0.37700417307269934, + "grad_norm": 2.1588521003723145, + "learning_rate": 1e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.691002368927002, + "num_tokens": 87248044.0, + "step": 3433 + }, + { + "epoch": 0.377113990775313, + "grad_norm": 2.324833631515503, + "learning_rate": 1e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.6958782076835632, + "num_tokens": 87270839.0, + "step": 3434 + }, + { + "epoch": 0.37722380847792664, + "grad_norm": 2.435112953186035, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7083263397216797, + "num_tokens": 87292309.0, + "step": 3435 + }, + { + "epoch": 0.3773336261805403, + "grad_norm": 2.3372111320495605, + "learning_rate": 1e-06, + "loss": 1.008, + "mean_token_accuracy": 0.69350665807724, + "num_tokens": 87315015.0, + "step": 3436 + }, + { + "epoch": 0.377443443883154, + "grad_norm": 2.29465651512146, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7116416096687317, + "num_tokens": 87338762.0, + "step": 3437 + }, + { + "epoch": 0.37755326158576763, + "grad_norm": 2.0146117210388184, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7089252471923828, + "num_tokens": 87369224.0, + "step": 3438 + }, + { + "epoch": 0.3776630792883813, + "grad_norm": 2.4814205169677734, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7167015075683594, + "num_tokens": 87391783.0, + "step": 3439 + }, + { + "epoch": 0.3777728969909949, + "grad_norm": 1.883360743522644, + "learning_rate": 1e-06, + "loss": 1.0321, + "mean_token_accuracy": 0.6884774565696716, + "num_tokens": 87424246.0, + "step": 3440 + }, + { + "epoch": 0.3778827146936086, + "grad_norm": 2.2106404304504395, + "learning_rate": 1e-06, + "loss": 
1.015, + "mean_token_accuracy": 0.6947300434112549, + "num_tokens": 87452511.0, + "step": 3441 + }, + { + "epoch": 0.37799253239622227, + "grad_norm": 2.4545674324035645, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.698544442653656, + "num_tokens": 87475324.0, + "step": 3442 + }, + { + "epoch": 0.3781023500988359, + "grad_norm": 2.2912933826446533, + "learning_rate": 1e-06, + "loss": 1.0668, + "mean_token_accuracy": 0.6883518695831299, + "num_tokens": 87501207.0, + "step": 3443 + }, + { + "epoch": 0.3782121678014496, + "grad_norm": 2.3891117572784424, + "learning_rate": 1e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.6947675943374634, + "num_tokens": 87524125.0, + "step": 3444 + }, + { + "epoch": 0.37832198550406326, + "grad_norm": 2.1622262001037598, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.6960689425468445, + "num_tokens": 87548947.0, + "step": 3445 + }, + { + "epoch": 0.3784318032066769, + "grad_norm": 2.1801252365112305, + "learning_rate": 1e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.686526358127594, + "num_tokens": 87577560.0, + "step": 3446 + }, + { + "epoch": 0.37854162090929055, + "grad_norm": 2.2971954345703125, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.6960656642913818, + "num_tokens": 87600693.0, + "step": 3447 + }, + { + "epoch": 0.37865143861190426, + "grad_norm": 2.236508846282959, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.6955471038818359, + "num_tokens": 87627639.0, + "step": 3448 + }, + { + "epoch": 0.3787612563145179, + "grad_norm": 2.467066764831543, + "learning_rate": 1e-06, + "loss": 0.8271, + "mean_token_accuracy": 0.7404200434684753, + "num_tokens": 87650666.0, + "step": 3449 + }, + { + "epoch": 0.37887107401713155, + "grad_norm": 2.3544840812683105, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.702003538608551, + "num_tokens": 87673726.0, + "step": 3450 + }, + { + "epoch": 0.37898089171974525, 
+ "grad_norm": 2.2493467330932617, + "learning_rate": 1e-06, + "loss": 1.0768, + "mean_token_accuracy": 0.6800109148025513, + "num_tokens": 87698478.0, + "step": 3451 + }, + { + "epoch": 0.3790907094223589, + "grad_norm": 2.007866859436035, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.704177975654602, + "num_tokens": 87728645.0, + "step": 3452 + }, + { + "epoch": 0.37920052712497254, + "grad_norm": 2.023420572280884, + "learning_rate": 1e-06, + "loss": 1.0866, + "mean_token_accuracy": 0.6777766942977905, + "num_tokens": 87759552.0, + "step": 3453 + }, + { + "epoch": 0.3793103448275862, + "grad_norm": 2.673381805419922, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7054646015167236, + "num_tokens": 87780351.0, + "step": 3454 + }, + { + "epoch": 0.3794201625301999, + "grad_norm": 2.4188897609710693, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7124675512313843, + "num_tokens": 87802216.0, + "step": 3455 + }, + { + "epoch": 0.37952998023281354, + "grad_norm": 2.412256956100464, + "learning_rate": 1e-06, + "loss": 0.8757, + "mean_token_accuracy": 0.7295180559158325, + "num_tokens": 87823739.0, + "step": 3456 + }, + { + "epoch": 0.3796397979354272, + "grad_norm": 2.54266095161438, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7113617062568665, + "num_tokens": 87844596.0, + "step": 3457 + }, + { + "epoch": 0.3797496156380408, + "grad_norm": 2.3988757133483887, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7012817859649658, + "num_tokens": 87868362.0, + "step": 3458 + }, + { + "epoch": 0.37985943334065453, + "grad_norm": 2.587038278579712, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.7055425643920898, + "num_tokens": 87890460.0, + "step": 3459 + }, + { + "epoch": 0.3799692510432682, + "grad_norm": 2.5190987586975098, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7129098773002625, + "num_tokens": 
87912939.0, + "step": 3460 + }, + { + "epoch": 0.3800790687458818, + "grad_norm": 2.3250417709350586, + "learning_rate": 1e-06, + "loss": 1.0283, + "mean_token_accuracy": 0.6905642747879028, + "num_tokens": 87940077.0, + "step": 3461 + }, + { + "epoch": 0.3801888864484955, + "grad_norm": 1.9993977546691895, + "learning_rate": 1e-06, + "loss": 1.0661, + "mean_token_accuracy": 0.6799188852310181, + "num_tokens": 87972250.0, + "step": 3462 + }, + { + "epoch": 0.38029870415110917, + "grad_norm": 2.3300044536590576, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7273444533348083, + "num_tokens": 87997126.0, + "step": 3463 + }, + { + "epoch": 0.3804085218537228, + "grad_norm": 2.6456844806671143, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.704900860786438, + "num_tokens": 88017530.0, + "step": 3464 + }, + { + "epoch": 0.38051833955633646, + "grad_norm": 2.2722785472869873, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7245153784751892, + "num_tokens": 88044155.0, + "step": 3465 + }, + { + "epoch": 0.38062815725895016, + "grad_norm": 2.2779088020324707, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.725845217704773, + "num_tokens": 88067655.0, + "step": 3466 + }, + { + "epoch": 0.3807379749615638, + "grad_norm": 2.0746371746063232, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6843979358673096, + "num_tokens": 88098583.0, + "step": 3467 + }, + { + "epoch": 0.38084779266417745, + "grad_norm": 2.4352481365203857, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.7039374709129333, + "num_tokens": 88121988.0, + "step": 3468 + }, + { + "epoch": 0.3809576103667911, + "grad_norm": 2.0826821327209473, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7203106880187988, + "num_tokens": 88148456.0, + "step": 3469 + }, + { + "epoch": 0.3810674280694048, + "grad_norm": 1.923039436340332, + "learning_rate": 1e-06, + 
"loss": 0.8381, + "mean_token_accuracy": 0.7380921840667725, + "num_tokens": 88175226.0, + "step": 3470 + }, + { + "epoch": 0.38117724577201845, + "grad_norm": 2.2466957569122314, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7144554257392883, + "num_tokens": 88198823.0, + "step": 3471 + }, + { + "epoch": 0.3812870634746321, + "grad_norm": 2.31823992729187, + "learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.6934636831283569, + "num_tokens": 88223292.0, + "step": 3472 + }, + { + "epoch": 0.3813968811772458, + "grad_norm": 2.2285187244415283, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7189590930938721, + "num_tokens": 88246281.0, + "step": 3473 + }, + { + "epoch": 0.38150669887985944, + "grad_norm": 2.3903160095214844, + "learning_rate": 1e-06, + "loss": 1.0199, + "mean_token_accuracy": 0.6980285048484802, + "num_tokens": 88270969.0, + "step": 3474 + }, + { + "epoch": 0.3816165165824731, + "grad_norm": 2.0985825061798096, + "learning_rate": 1e-06, + "loss": 1.0662, + "mean_token_accuracy": 0.6862311363220215, + "num_tokens": 88300040.0, + "step": 3475 + }, + { + "epoch": 0.38172633428508673, + "grad_norm": 2.037044048309326, + "learning_rate": 1e-06, + "loss": 1.0542, + "mean_token_accuracy": 0.6887497305870056, + "num_tokens": 88329220.0, + "step": 3476 + }, + { + "epoch": 0.38183615198770043, + "grad_norm": 2.42836856842041, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.7017747163772583, + "num_tokens": 88351160.0, + "step": 3477 + }, + { + "epoch": 0.3819459696903141, + "grad_norm": 2.1590545177459717, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7220777273178101, + "num_tokens": 88378113.0, + "step": 3478 + }, + { + "epoch": 0.3820557873929277, + "grad_norm": 2.354175567626953, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7263107895851135, + "num_tokens": 88399937.0, + "step": 3479 + }, + { + "epoch": 
0.3821656050955414, + "grad_norm": 2.2048726081848145, + "learning_rate": 1e-06, + "loss": 1.0709, + "mean_token_accuracy": 0.6880965232849121, + "num_tokens": 88425337.0, + "step": 3480 + }, + { + "epoch": 0.3822754227981551, + "grad_norm": 2.0943541526794434, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.704364001750946, + "num_tokens": 88453332.0, + "step": 3481 + }, + { + "epoch": 0.3823852405007687, + "grad_norm": 2.3711087703704834, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7057175040245056, + "num_tokens": 88475183.0, + "step": 3482 + }, + { + "epoch": 0.38249505820338237, + "grad_norm": 2.403395652770996, + "learning_rate": 1e-06, + "loss": 1.071, + "mean_token_accuracy": 0.6793497800827026, + "num_tokens": 88499904.0, + "step": 3483 + }, + { + "epoch": 0.38260487590599607, + "grad_norm": 2.0623421669006348, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7047206163406372, + "num_tokens": 88527480.0, + "step": 3484 + }, + { + "epoch": 0.3827146936086097, + "grad_norm": 2.339892864227295, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.722178041934967, + "num_tokens": 88550359.0, + "step": 3485 + }, + { + "epoch": 0.38282451131122336, + "grad_norm": 2.0630578994750977, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.6975772976875305, + "num_tokens": 88578546.0, + "step": 3486 + }, + { + "epoch": 0.382934329013837, + "grad_norm": 2.0512757301330566, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7149267196655273, + "num_tokens": 88606686.0, + "step": 3487 + }, + { + "epoch": 0.3830441467164507, + "grad_norm": 2.0601465702056885, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.6927703619003296, + "num_tokens": 88636967.0, + "step": 3488 + }, + { + "epoch": 0.38315396441906435, + "grad_norm": 2.2171106338500977, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 
0.6994956731796265, + "num_tokens": 88663521.0, + "step": 3489 + }, + { + "epoch": 0.383263782121678, + "grad_norm": 2.149167776107788, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7050307393074036, + "num_tokens": 88689271.0, + "step": 3490 + }, + { + "epoch": 0.3833735998242917, + "grad_norm": 2.351586103439331, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7250463962554932, + "num_tokens": 88711416.0, + "step": 3491 + }, + { + "epoch": 0.38348341752690535, + "grad_norm": 2.0714588165283203, + "learning_rate": 1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7291282415390015, + "num_tokens": 88740882.0, + "step": 3492 + }, + { + "epoch": 0.383593235229519, + "grad_norm": 2.25281023979187, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7192108631134033, + "num_tokens": 88765354.0, + "step": 3493 + }, + { + "epoch": 0.38370305293213264, + "grad_norm": 2.081082344055176, + "learning_rate": 1e-06, + "loss": 1.0076, + "mean_token_accuracy": 0.7076855897903442, + "num_tokens": 88795053.0, + "step": 3494 + }, + { + "epoch": 0.38381287063474634, + "grad_norm": 2.065263271331787, + "learning_rate": 1e-06, + "loss": 1.0622, + "mean_token_accuracy": 0.6790255904197693, + "num_tokens": 88825184.0, + "step": 3495 + }, + { + "epoch": 0.38392268833736, + "grad_norm": 2.1709859371185303, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.6937474012374878, + "num_tokens": 88850709.0, + "step": 3496 + }, + { + "epoch": 0.38403250603997363, + "grad_norm": 2.3517544269561768, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.7027933597564697, + "num_tokens": 88873506.0, + "step": 3497 + }, + { + "epoch": 0.38414232374258733, + "grad_norm": 2.5214834213256836, + "learning_rate": 1e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7298438549041748, + "num_tokens": 88895672.0, + "step": 3498 + }, + { + "epoch": 0.384252141445201, + "grad_norm": 2.6614465713500977, + 
"learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7147625684738159, + "num_tokens": 88914176.0, + "step": 3499 + }, + { + "epoch": 0.3843619591478146, + "grad_norm": 2.246821880340576, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7013890743255615, + "num_tokens": 88940091.0, + "step": 3500 + }, + { + "epoch": 0.38447177685042827, + "grad_norm": 2.073427677154541, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.71330726146698, + "num_tokens": 88968424.0, + "step": 3501 + }, + { + "epoch": 0.38458159455304197, + "grad_norm": 2.42498517036438, + "learning_rate": 1e-06, + "loss": 1.0742, + "mean_token_accuracy": 0.6805281043052673, + "num_tokens": 88991685.0, + "step": 3502 + }, + { + "epoch": 0.3846914122556556, + "grad_norm": 2.175539493560791, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7175769805908203, + "num_tokens": 89017025.0, + "step": 3503 + }, + { + "epoch": 0.38480122995826926, + "grad_norm": 2.0432796478271484, + "learning_rate": 1e-06, + "loss": 1.0942, + "mean_token_accuracy": 0.6718150973320007, + "num_tokens": 89049714.0, + "step": 3504 + }, + { + "epoch": 0.3849110476608829, + "grad_norm": 2.3065035343170166, + "learning_rate": 1e-06, + "loss": 0.909, + "mean_token_accuracy": 0.7210997343063354, + "num_tokens": 89072896.0, + "step": 3505 + }, + { + "epoch": 0.3850208653634966, + "grad_norm": 2.272503137588501, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.707675576210022, + "num_tokens": 89098063.0, + "step": 3506 + }, + { + "epoch": 0.38513068306611026, + "grad_norm": 2.457719326019287, + "learning_rate": 1e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.7311916351318359, + "num_tokens": 89120209.0, + "step": 3507 + }, + { + "epoch": 0.3852405007687239, + "grad_norm": 2.0863702297210693, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7133420705795288, + "num_tokens": 89148819.0, + "step": 3508 + }, + { + 
"epoch": 0.3853503184713376, + "grad_norm": 2.6924335956573486, + "learning_rate": 1e-06, + "loss": 0.8223, + "mean_token_accuracy": 0.743959903717041, + "num_tokens": 89169333.0, + "step": 3509 + }, + { + "epoch": 0.38546013617395125, + "grad_norm": 2.514343023300171, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7111090421676636, + "num_tokens": 89188755.0, + "step": 3510 + }, + { + "epoch": 0.3855699538765649, + "grad_norm": 2.374101161956787, + "learning_rate": 1e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.7035568356513977, + "num_tokens": 89211318.0, + "step": 3511 + }, + { + "epoch": 0.38567977157917854, + "grad_norm": 2.4316067695617676, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7105631232261658, + "num_tokens": 89232395.0, + "step": 3512 + }, + { + "epoch": 0.38578958928179224, + "grad_norm": 2.3058266639709473, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7103846073150635, + "num_tokens": 89254494.0, + "step": 3513 + }, + { + "epoch": 0.3858994069844059, + "grad_norm": 2.1859700679779053, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.6994613409042358, + "num_tokens": 89280428.0, + "step": 3514 + }, + { + "epoch": 0.38600922468701954, + "grad_norm": 2.224672555923462, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.706893801689148, + "num_tokens": 89308539.0, + "step": 3515 + }, + { + "epoch": 0.3861190423896332, + "grad_norm": 2.3571527004241943, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7080091238021851, + "num_tokens": 89331182.0, + "step": 3516 + }, + { + "epoch": 0.3862288600922469, + "grad_norm": 2.718630075454712, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.714363694190979, + "num_tokens": 89349943.0, + "step": 3517 + }, + { + "epoch": 0.38633867779486053, + "grad_norm": 2.086970329284668, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 
0.694022536277771, + "num_tokens": 89379656.0, + "step": 3518 + }, + { + "epoch": 0.3864484954974742, + "grad_norm": 2.57810115814209, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7120789289474487, + "num_tokens": 89398015.0, + "step": 3519 + }, + { + "epoch": 0.3865583132000879, + "grad_norm": 2.4458534717559814, + "learning_rate": 1e-06, + "loss": 0.856, + "mean_token_accuracy": 0.7341350317001343, + "num_tokens": 89418817.0, + "step": 3520 + }, + { + "epoch": 0.3866681309027015, + "grad_norm": 1.9106014966964722, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7199349403381348, + "num_tokens": 89452910.0, + "step": 3521 + }, + { + "epoch": 0.38677794860531517, + "grad_norm": 2.451988697052002, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.6979101300239563, + "num_tokens": 89475864.0, + "step": 3522 + }, + { + "epoch": 0.3868877663079288, + "grad_norm": 2.260730266571045, + "learning_rate": 1e-06, + "loss": 1.069, + "mean_token_accuracy": 0.6787948608398438, + "num_tokens": 89502018.0, + "step": 3523 + }, + { + "epoch": 0.3869975840105425, + "grad_norm": 2.0563032627105713, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7078534364700317, + "num_tokens": 89532692.0, + "step": 3524 + }, + { + "epoch": 0.38710740171315616, + "grad_norm": 2.3202571868896484, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7037155628204346, + "num_tokens": 89555260.0, + "step": 3525 + }, + { + "epoch": 0.3872172194157698, + "grad_norm": 1.9507834911346436, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7005126476287842, + "num_tokens": 89586510.0, + "step": 3526 + }, + { + "epoch": 0.3873270371183835, + "grad_norm": 1.995145559310913, + "learning_rate": 1e-06, + "loss": 1.1392, + "mean_token_accuracy": 0.6610731482505798, + "num_tokens": 89617521.0, + "step": 3527 + }, + { + "epoch": 0.38743685482099716, + "grad_norm": 2.5444037914276123, + 
"learning_rate": 1e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.6901000142097473, + "num_tokens": 89638652.0, + "step": 3528 + }, + { + "epoch": 0.3875466725236108, + "grad_norm": 2.537641763687134, + "learning_rate": 1e-06, + "loss": 1.0498, + "mean_token_accuracy": 0.6840572357177734, + "num_tokens": 89660501.0, + "step": 3529 + }, + { + "epoch": 0.38765649022622445, + "grad_norm": 2.385321617126465, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7111868858337402, + "num_tokens": 89684900.0, + "step": 3530 + }, + { + "epoch": 0.38776630792883815, + "grad_norm": 2.236076831817627, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7210693955421448, + "num_tokens": 89711068.0, + "step": 3531 + }, + { + "epoch": 0.3878761256314518, + "grad_norm": 2.5075604915618896, + "learning_rate": 1e-06, + "loss": 1.0366, + "mean_token_accuracy": 0.6881439685821533, + "num_tokens": 89734358.0, + "step": 3532 + }, + { + "epoch": 0.38798594333406544, + "grad_norm": 2.022225856781006, + "learning_rate": 1e-06, + "loss": 1.0657, + "mean_token_accuracy": 0.6847184896469116, + "num_tokens": 89765424.0, + "step": 3533 + }, + { + "epoch": 0.3880957610366791, + "grad_norm": 2.363490581512451, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7242723107337952, + "num_tokens": 89787552.0, + "step": 3534 + }, + { + "epoch": 0.3882055787392928, + "grad_norm": 2.4641754627227783, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7244274616241455, + "num_tokens": 89808301.0, + "step": 3535 + }, + { + "epoch": 0.38831539644190644, + "grad_norm": 2.244457721710205, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7018155455589294, + "num_tokens": 89835111.0, + "step": 3536 + }, + { + "epoch": 0.3884252141445201, + "grad_norm": 2.134918451309204, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.7004444599151611, + "num_tokens": 89861288.0, + "step": 3537 + }, + { 
+ "epoch": 0.3885350318471338, + "grad_norm": 2.4275598526000977, + "learning_rate": 1e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.7002764940261841, + "num_tokens": 89883359.0, + "step": 3538 + }, + { + "epoch": 0.38864484954974743, + "grad_norm": 2.2964909076690674, + "learning_rate": 1e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.6958073377609253, + "num_tokens": 89907393.0, + "step": 3539 + }, + { + "epoch": 0.3887546672523611, + "grad_norm": 1.7938436269760132, + "learning_rate": 1e-06, + "loss": 1.1044, + "mean_token_accuracy": 0.673579752445221, + "num_tokens": 89947136.0, + "step": 3540 + }, + { + "epoch": 0.3888644849549747, + "grad_norm": 2.1447980403900146, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.7100917100906372, + "num_tokens": 89973991.0, + "step": 3541 + }, + { + "epoch": 0.3889743026575884, + "grad_norm": 1.8337366580963135, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.689627468585968, + "num_tokens": 90006828.0, + "step": 3542 + }, + { + "epoch": 0.38908412036020207, + "grad_norm": 2.461972951889038, + "learning_rate": 1e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7219427824020386, + "num_tokens": 90026854.0, + "step": 3543 + }, + { + "epoch": 0.3891939380628157, + "grad_norm": 2.2000043392181396, + "learning_rate": 1e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.6922730803489685, + "num_tokens": 90051663.0, + "step": 3544 + }, + { + "epoch": 0.38930375576542936, + "grad_norm": 2.0655064582824707, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.6913496255874634, + "num_tokens": 90080175.0, + "step": 3545 + }, + { + "epoch": 0.38941357346804306, + "grad_norm": 2.2523913383483887, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7134597897529602, + "num_tokens": 90104737.0, + "step": 3546 + }, + { + "epoch": 0.3895233911706567, + "grad_norm": 2.0847949981689453, + "learning_rate": 1e-06, + "loss": 1.0742, + "mean_token_accuracy": 
0.6792975664138794, + "num_tokens": 90133621.0, + "step": 3547 + }, + { + "epoch": 0.38963320887327035, + "grad_norm": 2.2646894454956055, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7228515148162842, + "num_tokens": 90156973.0, + "step": 3548 + }, + { + "epoch": 0.38974302657588406, + "grad_norm": 2.0917656421661377, + "learning_rate": 1e-06, + "loss": 1.0407, + "mean_token_accuracy": 0.6873276233673096, + "num_tokens": 90186228.0, + "step": 3549 + }, + { + "epoch": 0.3898528442784977, + "grad_norm": 2.2156436443328857, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7032812237739563, + "num_tokens": 90212859.0, + "step": 3550 + }, + { + "epoch": 0.38996266198111135, + "grad_norm": 1.9810014963150024, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.69420325756073, + "num_tokens": 90242502.0, + "step": 3551 + }, + { + "epoch": 0.390072479683725, + "grad_norm": 2.116347312927246, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.6991373300552368, + "num_tokens": 90269955.0, + "step": 3552 + }, + { + "epoch": 0.3901822973863387, + "grad_norm": 2.560824394226074, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.721242368221283, + "num_tokens": 90288707.0, + "step": 3553 + }, + { + "epoch": 0.39029211508895234, + "grad_norm": 2.6885159015655518, + "learning_rate": 1e-06, + "loss": 0.8198, + "mean_token_accuracy": 0.7385980486869812, + "num_tokens": 90307341.0, + "step": 3554 + }, + { + "epoch": 0.390401932791566, + "grad_norm": 2.2103865146636963, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7171945571899414, + "num_tokens": 90332100.0, + "step": 3555 + }, + { + "epoch": 0.3905117504941797, + "grad_norm": 2.272913932800293, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.721386194229126, + "num_tokens": 90355401.0, + "step": 3556 + }, + { + "epoch": 0.39062156819679333, + "grad_norm": 2.2940337657928467, + 
"learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.6934177875518799, + "num_tokens": 90379279.0, + "step": 3557 + }, + { + "epoch": 0.390731385899407, + "grad_norm": 2.388765573501587, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.7000513076782227, + "num_tokens": 90401985.0, + "step": 3558 + }, + { + "epoch": 0.3908412036020206, + "grad_norm": 2.024705171585083, + "learning_rate": 1e-06, + "loss": 1.0602, + "mean_token_accuracy": 0.6861668825149536, + "num_tokens": 90432098.0, + "step": 3559 + }, + { + "epoch": 0.3909510213046343, + "grad_norm": 2.1275694370269775, + "learning_rate": 1e-06, + "loss": 1.0663, + "mean_token_accuracy": 0.6778765916824341, + "num_tokens": 90460867.0, + "step": 3560 + }, + { + "epoch": 0.391060839007248, + "grad_norm": 3.127436876296997, + "learning_rate": 1e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.7351836562156677, + "num_tokens": 90476579.0, + "step": 3561 + }, + { + "epoch": 0.3911706567098616, + "grad_norm": 2.3039448261260986, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7048861980438232, + "num_tokens": 90499445.0, + "step": 3562 + }, + { + "epoch": 0.39128047441247527, + "grad_norm": 2.3055434226989746, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.711316704750061, + "num_tokens": 90523392.0, + "step": 3563 + }, + { + "epoch": 0.39139029211508897, + "grad_norm": 2.2980356216430664, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7220602035522461, + "num_tokens": 90547576.0, + "step": 3564 + }, + { + "epoch": 0.3915001098177026, + "grad_norm": 2.4541893005371094, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7069559097290039, + "num_tokens": 90569667.0, + "step": 3565 + }, + { + "epoch": 0.39160992752031626, + "grad_norm": 2.1042685508728027, + "learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.6927204132080078, + "num_tokens": 90598209.0, + "step": 3566 + }, + { 
+ "epoch": 0.39171974522292996, + "grad_norm": 2.1282029151916504, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7107222080230713, + "num_tokens": 90625207.0, + "step": 3567 + }, + { + "epoch": 0.3918295629255436, + "grad_norm": 2.022829055786133, + "learning_rate": 1e-06, + "loss": 1.046, + "mean_token_accuracy": 0.6877106428146362, + "num_tokens": 90657319.0, + "step": 3568 + }, + { + "epoch": 0.39193938062815725, + "grad_norm": 2.2379212379455566, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7188760638237, + "num_tokens": 90681080.0, + "step": 3569 + }, + { + "epoch": 0.3920491983307709, + "grad_norm": 2.37045955657959, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.6910734176635742, + "num_tokens": 90703451.0, + "step": 3570 + }, + { + "epoch": 0.3921590160333846, + "grad_norm": 2.6543729305267334, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7125128507614136, + "num_tokens": 90722395.0, + "step": 3571 + }, + { + "epoch": 0.39226883373599825, + "grad_norm": 2.2330992221832275, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7035635709762573, + "num_tokens": 90747159.0, + "step": 3572 + }, + { + "epoch": 0.3923786514386119, + "grad_norm": 2.5003697872161865, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7058863639831543, + "num_tokens": 90769313.0, + "step": 3573 + }, + { + "epoch": 0.3924884691412256, + "grad_norm": 2.578425168991089, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7121277451515198, + "num_tokens": 90788574.0, + "step": 3574 + }, + { + "epoch": 0.39259828684383924, + "grad_norm": 2.3715474605560303, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.7020009756088257, + "num_tokens": 90811941.0, + "step": 3575 + }, + { + "epoch": 0.3927081045464529, + "grad_norm": 2.601555347442627, + "learning_rate": 1e-06, + "loss": 0.8883, + "mean_token_accuracy": 
0.7276142835617065, + "num_tokens": 90830825.0, + "step": 3576 + }, + { + "epoch": 0.39281792224906653, + "grad_norm": 2.4863569736480713, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.6979659795761108, + "num_tokens": 90852975.0, + "step": 3577 + }, + { + "epoch": 0.39292773995168023, + "grad_norm": 2.2905094623565674, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.6956533193588257, + "num_tokens": 90880208.0, + "step": 3578 + }, + { + "epoch": 0.3930375576542939, + "grad_norm": 1.9676214456558228, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.6949790716171265, + "num_tokens": 90910618.0, + "step": 3579 + }, + { + "epoch": 0.3931473753569075, + "grad_norm": 2.115149974822998, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.6977096796035767, + "num_tokens": 90939707.0, + "step": 3580 + }, + { + "epoch": 0.39325719305952117, + "grad_norm": 2.281548023223877, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7089052796363831, + "num_tokens": 90965463.0, + "step": 3581 + }, + { + "epoch": 0.39336701076213487, + "grad_norm": 2.3487050533294678, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.699158787727356, + "num_tokens": 90989272.0, + "step": 3582 + }, + { + "epoch": 0.3934768284647485, + "grad_norm": 2.2268598079681396, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.702934980392456, + "num_tokens": 91013523.0, + "step": 3583 + }, + { + "epoch": 0.39358664616736216, + "grad_norm": 2.0767648220062256, + "learning_rate": 1e-06, + "loss": 1.0463, + "mean_token_accuracy": 0.6916062831878662, + "num_tokens": 91041521.0, + "step": 3584 + }, + { + "epoch": 0.39369646386997587, + "grad_norm": 2.187932014465332, + "learning_rate": 1e-06, + "loss": 1.0547, + "mean_token_accuracy": 0.6811190843582153, + "num_tokens": 91068597.0, + "step": 3585 + }, + { + "epoch": 0.3938062815725895, + "grad_norm": 2.334624767303467, 
+ "learning_rate": 1e-06, + "loss": 1.0853, + "mean_token_accuracy": 0.6735765337944031, + "num_tokens": 91093749.0, + "step": 3586 + }, + { + "epoch": 0.39391609927520316, + "grad_norm": 2.068060874938965, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.703633189201355, + "num_tokens": 91122116.0, + "step": 3587 + }, + { + "epoch": 0.3940259169778168, + "grad_norm": 2.076554298400879, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.6943674683570862, + "num_tokens": 91153765.0, + "step": 3588 + }, + { + "epoch": 0.3941357346804305, + "grad_norm": 2.0752949714660645, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7084647417068481, + "num_tokens": 91181588.0, + "step": 3589 + }, + { + "epoch": 0.39424555238304415, + "grad_norm": 2.047272205352783, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.6920095682144165, + "num_tokens": 91208426.0, + "step": 3590 + }, + { + "epoch": 0.3943553700856578, + "grad_norm": 2.236693859100342, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7034416198730469, + "num_tokens": 91233808.0, + "step": 3591 + }, + { + "epoch": 0.39446518778827144, + "grad_norm": 2.038532018661499, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.7023049592971802, + "num_tokens": 91263936.0, + "step": 3592 + }, + { + "epoch": 0.39457500549088514, + "grad_norm": 2.2938170433044434, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.6915003657341003, + "num_tokens": 91286504.0, + "step": 3593 + }, + { + "epoch": 0.3946848231934988, + "grad_norm": 2.2516489028930664, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.6984468698501587, + "num_tokens": 91313299.0, + "step": 3594 + }, + { + "epoch": 0.39479464089611244, + "grad_norm": 2.235264539718628, + "learning_rate": 1e-06, + "loss": 1.032, + "mean_token_accuracy": 0.6932334899902344, + "num_tokens": 91339516.0, + "step": 3595 + }, + { + 
"epoch": 0.39490445859872614, + "grad_norm": 2.035954475402832, + "learning_rate": 1e-06, + "loss": 1.109, + "mean_token_accuracy": 0.665902853012085, + "num_tokens": 91372466.0, + "step": 3596 + }, + { + "epoch": 0.3950142763013398, + "grad_norm": 2.0228729248046875, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.69615638256073, + "num_tokens": 91402852.0, + "step": 3597 + }, + { + "epoch": 0.39512409400395343, + "grad_norm": 1.9446567296981812, + "learning_rate": 1e-06, + "loss": 1.0372, + "mean_token_accuracy": 0.6866823434829712, + "num_tokens": 91433358.0, + "step": 3598 + }, + { + "epoch": 0.3952339117065671, + "grad_norm": 2.1913397312164307, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7114058136940002, + "num_tokens": 91458871.0, + "step": 3599 + }, + { + "epoch": 0.3953437294091808, + "grad_norm": 2.197662115097046, + "learning_rate": 1e-06, + "loss": 1.0698, + "mean_token_accuracy": 0.6809353828430176, + "num_tokens": 91486960.0, + "step": 3600 + }, + { + "epoch": 0.3954535471117944, + "grad_norm": 2.2880852222442627, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7112219929695129, + "num_tokens": 91512932.0, + "step": 3601 + }, + { + "epoch": 0.39556336481440807, + "grad_norm": 2.2753894329071045, + "learning_rate": 1e-06, + "loss": 1.03, + "mean_token_accuracy": 0.6885800361633301, + "num_tokens": 91540414.0, + "step": 3602 + }, + { + "epoch": 0.39567318251702177, + "grad_norm": 2.2012367248535156, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.700036346912384, + "num_tokens": 91566460.0, + "step": 3603 + }, + { + "epoch": 0.3957830002196354, + "grad_norm": 2.4137444496154785, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.6885025501251221, + "num_tokens": 91589562.0, + "step": 3604 + }, + { + "epoch": 0.39589281792224906, + "grad_norm": 2.115694522857666, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 
0.6896399259567261, + "num_tokens": 91618014.0, + "step": 3605 + }, + { + "epoch": 0.3960026356248627, + "grad_norm": 2.2809410095214844, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.6935425996780396, + "num_tokens": 91642207.0, + "step": 3606 + }, + { + "epoch": 0.3961124533274764, + "grad_norm": 2.497997283935547, + "learning_rate": 1e-06, + "loss": 1.0693, + "mean_token_accuracy": 0.6808248162269592, + "num_tokens": 91662601.0, + "step": 3607 + }, + { + "epoch": 0.39622227103009006, + "grad_norm": 2.4145522117614746, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.6925379037857056, + "num_tokens": 91683348.0, + "step": 3608 + }, + { + "epoch": 0.3963320887327037, + "grad_norm": 2.279672622680664, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.714267373085022, + "num_tokens": 91708289.0, + "step": 3609 + }, + { + "epoch": 0.39644190643531735, + "grad_norm": 2.20967173576355, + "learning_rate": 1e-06, + "loss": 1.0413, + "mean_token_accuracy": 0.6849019527435303, + "num_tokens": 91733852.0, + "step": 3610 + }, + { + "epoch": 0.39655172413793105, + "grad_norm": 2.5041489601135254, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.707102358341217, + "num_tokens": 91753629.0, + "step": 3611 + }, + { + "epoch": 0.3966615418405447, + "grad_norm": 2.099881410598755, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.7083752751350403, + "num_tokens": 91779379.0, + "step": 3612 + }, + { + "epoch": 0.39677135954315834, + "grad_norm": 2.0781190395355225, + "learning_rate": 1e-06, + "loss": 1.053, + "mean_token_accuracy": 0.6928837299346924, + "num_tokens": 91808104.0, + "step": 3613 + }, + { + "epoch": 0.39688117724577204, + "grad_norm": 2.2606468200683594, + "learning_rate": 1e-06, + "loss": 1.0684, + "mean_token_accuracy": 0.6846924424171448, + "num_tokens": 91836815.0, + "step": 3614 + }, + { + "epoch": 0.3969909949483857, + "grad_norm": 1.9606895446777344, + 
"learning_rate": 1e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.6819628477096558, + "num_tokens": 91869624.0, + "step": 3615 + }, + { + "epoch": 0.39710081265099934, + "grad_norm": 2.267369270324707, + "learning_rate": 1e-06, + "loss": 1.0521, + "mean_token_accuracy": 0.6856378316879272, + "num_tokens": 91895624.0, + "step": 3616 + }, + { + "epoch": 0.397210630353613, + "grad_norm": 2.0321972370147705, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7046226263046265, + "num_tokens": 91926638.0, + "step": 3617 + }, + { + "epoch": 0.3973204480562267, + "grad_norm": 2.1902756690979004, + "learning_rate": 1e-06, + "loss": 1.0791, + "mean_token_accuracy": 0.6830896139144897, + "num_tokens": 91953663.0, + "step": 3618 + }, + { + "epoch": 0.39743026575884033, + "grad_norm": 2.1083076000213623, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7156449556350708, + "num_tokens": 91979285.0, + "step": 3619 + }, + { + "epoch": 0.397540083461454, + "grad_norm": 2.0733072757720947, + "learning_rate": 1e-06, + "loss": 1.0981, + "mean_token_accuracy": 0.6780250072479248, + "num_tokens": 92009642.0, + "step": 3620 + }, + { + "epoch": 0.3976499011640676, + "grad_norm": 2.2820358276367188, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7223371863365173, + "num_tokens": 92030079.0, + "step": 3621 + }, + { + "epoch": 0.3977597188666813, + "grad_norm": 2.3713409900665283, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7107138633728027, + "num_tokens": 92051803.0, + "step": 3622 + }, + { + "epoch": 0.39786953656929497, + "grad_norm": 2.2666068077087402, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7102502584457397, + "num_tokens": 92075818.0, + "step": 3623 + }, + { + "epoch": 0.3979793542719086, + "grad_norm": 2.388986349105835, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7038017511367798, + "num_tokens": 92098358.0, + "step": 3624 + }, + 
{ + "epoch": 0.3980891719745223, + "grad_norm": 2.020277738571167, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.6974835395812988, + "num_tokens": 92130002.0, + "step": 3625 + }, + { + "epoch": 0.39819898967713596, + "grad_norm": 1.9614157676696777, + "learning_rate": 1e-06, + "loss": 1.0528, + "mean_token_accuracy": 0.6900773644447327, + "num_tokens": 92162143.0, + "step": 3626 + }, + { + "epoch": 0.3983088073797496, + "grad_norm": 2.104497194290161, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7165186405181885, + "num_tokens": 92189056.0, + "step": 3627 + }, + { + "epoch": 0.39841862508236325, + "grad_norm": 2.21821928024292, + "learning_rate": 1e-06, + "loss": 0.8417, + "mean_token_accuracy": 0.7377954125404358, + "num_tokens": 92212928.0, + "step": 3628 + }, + { + "epoch": 0.39852844278497696, + "grad_norm": 2.126373291015625, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.7005906105041504, + "num_tokens": 92239178.0, + "step": 3629 + }, + { + "epoch": 0.3986382604875906, + "grad_norm": 2.1190459728240967, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7120765447616577, + "num_tokens": 92266930.0, + "step": 3630 + }, + { + "epoch": 0.39874807819020425, + "grad_norm": 2.2195982933044434, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.6995055079460144, + "num_tokens": 92293427.0, + "step": 3631 + }, + { + "epoch": 0.39885789589281795, + "grad_norm": 2.155117988586426, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.697596549987793, + "num_tokens": 92319985.0, + "step": 3632 + }, + { + "epoch": 0.3989677135954316, + "grad_norm": 2.6030828952789307, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.6997003555297852, + "num_tokens": 92339691.0, + "step": 3633 + }, + { + "epoch": 0.39907753129804524, + "grad_norm": 2.179476737976074, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 
0.7062469124794006, + "num_tokens": 92364040.0, + "step": 3634 + }, + { + "epoch": 0.3991873490006589, + "grad_norm": 1.983761191368103, + "learning_rate": 1e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.7003129720687866, + "num_tokens": 92393639.0, + "step": 3635 + }, + { + "epoch": 0.3992971667032726, + "grad_norm": 2.5604333877563477, + "learning_rate": 1e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7218772172927856, + "num_tokens": 92412974.0, + "step": 3636 + }, + { + "epoch": 0.39940698440588623, + "grad_norm": 4.832563400268555, + "learning_rate": 1e-06, + "loss": 1.0597, + "mean_token_accuracy": 0.6883867979049683, + "num_tokens": 92446022.0, + "step": 3637 + }, + { + "epoch": 0.3995168021084999, + "grad_norm": 2.105250597000122, + "learning_rate": 1e-06, + "loss": 1.0378, + "mean_token_accuracy": 0.6897850036621094, + "num_tokens": 92475598.0, + "step": 3638 + }, + { + "epoch": 0.3996266198111135, + "grad_norm": 2.334937334060669, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7083648443222046, + "num_tokens": 92498045.0, + "step": 3639 + }, + { + "epoch": 0.3997364375137272, + "grad_norm": 2.0755615234375, + "learning_rate": 1e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.697676956653595, + "num_tokens": 92527082.0, + "step": 3640 + }, + { + "epoch": 0.3998462552163409, + "grad_norm": 2.073406934738159, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7133957147598267, + "num_tokens": 92554949.0, + "step": 3641 + }, + { + "epoch": 0.3999560729189545, + "grad_norm": 2.2437243461608887, + "learning_rate": 1e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7179508209228516, + "num_tokens": 92578902.0, + "step": 3642 + }, + { + "epoch": 0.4000658906215682, + "grad_norm": 1.9896022081375122, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7192704677581787, + "num_tokens": 92608603.0, + "step": 3643 + }, + { + "epoch": 0.40017570832418187, + "grad_norm": 2.082886219024658, + 
"learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.6991896629333496, + "num_tokens": 92637224.0, + "step": 3644 + }, + { + "epoch": 0.4002855260267955, + "grad_norm": 1.9005500078201294, + "learning_rate": 1e-06, + "loss": 1.1288, + "mean_token_accuracy": 0.6705142259597778, + "num_tokens": 92672232.0, + "step": 3645 + }, + { + "epoch": 0.40039534372940916, + "grad_norm": 2.377007484436035, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7098718881607056, + "num_tokens": 92694209.0, + "step": 3646 + }, + { + "epoch": 0.40050516143202286, + "grad_norm": 2.2545666694641113, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7117156982421875, + "num_tokens": 92719320.0, + "step": 3647 + }, + { + "epoch": 0.4006149791346365, + "grad_norm": 2.103053569793701, + "learning_rate": 1e-06, + "loss": 1.04, + "mean_token_accuracy": 0.6861852407455444, + "num_tokens": 92747275.0, + "step": 3648 + }, + { + "epoch": 0.40072479683725015, + "grad_norm": 2.1907155513763428, + "learning_rate": 1e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.683224081993103, + "num_tokens": 92772983.0, + "step": 3649 + }, + { + "epoch": 0.4008346145398638, + "grad_norm": 2.5067481994628906, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.693427324295044, + "num_tokens": 92793078.0, + "step": 3650 + }, + { + "epoch": 0.4009444322424775, + "grad_norm": 2.280975103378296, + "learning_rate": 1e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.6869654059410095, + "num_tokens": 92816775.0, + "step": 3651 + }, + { + "epoch": 0.40105424994509115, + "grad_norm": 2.1051151752471924, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.7008606195449829, + "num_tokens": 92844140.0, + "step": 3652 + }, + { + "epoch": 0.4011640676477048, + "grad_norm": 2.1799793243408203, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7203599810600281, + "num_tokens": 92870116.0, + "step": 3653 + }, + { 
+ "epoch": 0.4012738853503185, + "grad_norm": 2.202625036239624, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.7017656564712524, + "num_tokens": 92899016.0, + "step": 3654 + }, + { + "epoch": 0.40138370305293214, + "grad_norm": 2.2744619846343994, + "learning_rate": 1e-06, + "loss": 1.042, + "mean_token_accuracy": 0.6944336295127869, + "num_tokens": 92923126.0, + "step": 3655 + }, + { + "epoch": 0.4014935207555458, + "grad_norm": 2.221170425415039, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.709038496017456, + "num_tokens": 92946995.0, + "step": 3656 + }, + { + "epoch": 0.40160333845815943, + "grad_norm": 2.231612205505371, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7115311622619629, + "num_tokens": 92971470.0, + "step": 3657 + }, + { + "epoch": 0.40171315616077313, + "grad_norm": 2.117144823074341, + "learning_rate": 1e-06, + "loss": 1.0578, + "mean_token_accuracy": 0.6835165023803711, + "num_tokens": 92999834.0, + "step": 3658 + }, + { + "epoch": 0.4018229738633868, + "grad_norm": 2.313852071762085, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7175965309143066, + "num_tokens": 93024787.0, + "step": 3659 + }, + { + "epoch": 0.4019327915660004, + "grad_norm": 2.348539352416992, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7086062431335449, + "num_tokens": 93050340.0, + "step": 3660 + }, + { + "epoch": 0.4020426092686141, + "grad_norm": 1.951532244682312, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.6988546848297119, + "num_tokens": 93081650.0, + "step": 3661 + }, + { + "epoch": 0.40215242697122777, + "grad_norm": 2.307626724243164, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7175441384315491, + "num_tokens": 93104891.0, + "step": 3662 + }, + { + "epoch": 0.4022622446738414, + "grad_norm": 2.574099540710449, + "learning_rate": 1e-06, + "loss": 0.8789, + "mean_token_accuracy": 
0.7244969606399536, + "num_tokens": 93123947.0, + "step": 3663 + }, + { + "epoch": 0.40237206237645506, + "grad_norm": 2.476583480834961, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.705346941947937, + "num_tokens": 93144004.0, + "step": 3664 + }, + { + "epoch": 0.40248188007906877, + "grad_norm": 2.570211410522461, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7136305570602417, + "num_tokens": 93163308.0, + "step": 3665 + }, + { + "epoch": 0.4025916977816824, + "grad_norm": 2.0042779445648193, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.6927090883255005, + "num_tokens": 93198929.0, + "step": 3666 + }, + { + "epoch": 0.40270151548429606, + "grad_norm": 2.7301228046417236, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.6945552825927734, + "num_tokens": 93217742.0, + "step": 3667 + }, + { + "epoch": 0.4028113331869097, + "grad_norm": 2.0664305686950684, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.6974730491638184, + "num_tokens": 93247424.0, + "step": 3668 + }, + { + "epoch": 0.4029211508895234, + "grad_norm": 2.143444776535034, + "learning_rate": 1e-06, + "loss": 1.027, + "mean_token_accuracy": 0.6879360675811768, + "num_tokens": 93273423.0, + "step": 3669 + }, + { + "epoch": 0.40303096859213705, + "grad_norm": 1.8949233293533325, + "learning_rate": 1e-06, + "loss": 1.061, + "mean_token_accuracy": 0.6727056503295898, + "num_tokens": 93306447.0, + "step": 3670 + }, + { + "epoch": 0.4031407862947507, + "grad_norm": 2.251215934753418, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7222629189491272, + "num_tokens": 93329840.0, + "step": 3671 + }, + { + "epoch": 0.4032506039973644, + "grad_norm": 2.034419059753418, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.705358624458313, + "num_tokens": 93359981.0, + "step": 3672 + }, + { + "epoch": 0.40336042169997804, + "grad_norm": 2.1998794078826904, + 
"learning_rate": 1e-06, + "loss": 1.0492, + "mean_token_accuracy": 0.6853736639022827, + "num_tokens": 93388516.0, + "step": 3673 + }, + { + "epoch": 0.4034702394025917, + "grad_norm": 2.0955257415771484, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7091494798660278, + "num_tokens": 93414823.0, + "step": 3674 + }, + { + "epoch": 0.40358005710520534, + "grad_norm": 2.5126614570617676, + "learning_rate": 1e-06, + "loss": 1.0129, + "mean_token_accuracy": 0.6960285902023315, + "num_tokens": 93436210.0, + "step": 3675 + }, + { + "epoch": 0.40368987480781904, + "grad_norm": 2.5060276985168457, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.7010613679885864, + "num_tokens": 93459871.0, + "step": 3676 + }, + { + "epoch": 0.4037996925104327, + "grad_norm": 2.332894802093506, + "learning_rate": 1e-06, + "loss": 1.1132, + "mean_token_accuracy": 0.6722242832183838, + "num_tokens": 93483316.0, + "step": 3677 + }, + { + "epoch": 0.40390951021304633, + "grad_norm": 2.5017573833465576, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7203088998794556, + "num_tokens": 93504333.0, + "step": 3678 + }, + { + "epoch": 0.40401932791566003, + "grad_norm": 1.8679355382919312, + "learning_rate": 1e-06, + "loss": 1.05, + "mean_token_accuracy": 0.6842070817947388, + "num_tokens": 93540252.0, + "step": 3679 + }, + { + "epoch": 0.4041291456182737, + "grad_norm": 2.1221256256103516, + "learning_rate": 1e-06, + "loss": 1.0307, + "mean_token_accuracy": 0.6869562864303589, + "num_tokens": 93569587.0, + "step": 3680 + }, + { + "epoch": 0.4042389633208873, + "grad_norm": 2.254441738128662, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7055093050003052, + "num_tokens": 93592793.0, + "step": 3681 + }, + { + "epoch": 0.40434878102350097, + "grad_norm": 2.210728406906128, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7068209648132324, + "num_tokens": 93617143.0, + "step": 3682 + }, + { 
+ "epoch": 0.40445859872611467, + "grad_norm": 2.152100086212158, + "learning_rate": 1e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.6953835487365723, + "num_tokens": 93643035.0, + "step": 3683 + }, + { + "epoch": 0.4045684164287283, + "grad_norm": 2.3300726413726807, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7071135640144348, + "num_tokens": 93665499.0, + "step": 3684 + }, + { + "epoch": 0.40467823413134196, + "grad_norm": 1.9659438133239746, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.6949440836906433, + "num_tokens": 93698107.0, + "step": 3685 + }, + { + "epoch": 0.4047880518339556, + "grad_norm": 2.3988192081451416, + "learning_rate": 1e-06, + "loss": 1.0861, + "mean_token_accuracy": 0.6813757419586182, + "num_tokens": 93723699.0, + "step": 3686 + }, + { + "epoch": 0.4048978695365693, + "grad_norm": 2.4273416996002197, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.6994380354881287, + "num_tokens": 93745435.0, + "step": 3687 + }, + { + "epoch": 0.40500768723918296, + "grad_norm": 2.163219928741455, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.6931079030036926, + "num_tokens": 93771793.0, + "step": 3688 + }, + { + "epoch": 0.4051175049417966, + "grad_norm": 2.162733316421509, + "learning_rate": 1e-06, + "loss": 0.8643, + "mean_token_accuracy": 0.7355983853340149, + "num_tokens": 93797917.0, + "step": 3689 + }, + { + "epoch": 0.4052273226444103, + "grad_norm": 2.3350555896759033, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.7026491761207581, + "num_tokens": 93820898.0, + "step": 3690 + }, + { + "epoch": 0.40533714034702395, + "grad_norm": 2.292855739593506, + "learning_rate": 1e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7216658592224121, + "num_tokens": 93845056.0, + "step": 3691 + }, + { + "epoch": 0.4054469580496376, + "grad_norm": 2.831322431564331, + "learning_rate": 1e-06, + "loss": 0.8686, + "mean_token_accuracy": 
0.7228773832321167, + "num_tokens": 93860592.0, + "step": 3692 + }, + { + "epoch": 0.40555677575225124, + "grad_norm": 2.0332424640655518, + "learning_rate": 1e-06, + "loss": 1.0293, + "mean_token_accuracy": 0.690031886100769, + "num_tokens": 93890282.0, + "step": 3693 + }, + { + "epoch": 0.40566659345486494, + "grad_norm": 2.418421983718872, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7266239523887634, + "num_tokens": 93912152.0, + "step": 3694 + }, + { + "epoch": 0.4057764111574786, + "grad_norm": 2.182157039642334, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.6962299346923828, + "num_tokens": 93938498.0, + "step": 3695 + }, + { + "epoch": 0.40588622886009224, + "grad_norm": 2.1983704566955566, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7051037549972534, + "num_tokens": 93962224.0, + "step": 3696 + }, + { + "epoch": 0.4059960465627059, + "grad_norm": 2.253190040588379, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.6984356641769409, + "num_tokens": 93986307.0, + "step": 3697 + }, + { + "epoch": 0.4061058642653196, + "grad_norm": 2.321707248687744, + "learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.6959547996520996, + "num_tokens": 94011139.0, + "step": 3698 + }, + { + "epoch": 0.40621568196793323, + "grad_norm": 2.417288064956665, + "learning_rate": 1e-06, + "loss": 1.0994, + "mean_token_accuracy": 0.6798391342163086, + "num_tokens": 94036339.0, + "step": 3699 + }, + { + "epoch": 0.4063254996705469, + "grad_norm": 2.2522716522216797, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7176848649978638, + "num_tokens": 94061209.0, + "step": 3700 + }, + { + "epoch": 0.4064353173731606, + "grad_norm": 2.264631748199463, + "learning_rate": 1e-06, + "loss": 1.0431, + "mean_token_accuracy": 0.6818230152130127, + "num_tokens": 94087025.0, + "step": 3701 + }, + { + "epoch": 0.4065451350757742, + "grad_norm": 2.3258540630340576, + 
"learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7283857464790344, + "num_tokens": 94109818.0, + "step": 3702 + }, + { + "epoch": 0.40665495277838787, + "grad_norm": 2.408378839492798, + "learning_rate": 1e-06, + "loss": 1.0296, + "mean_token_accuracy": 0.6973381042480469, + "num_tokens": 94133394.0, + "step": 3703 + }, + { + "epoch": 0.4067647704810015, + "grad_norm": 2.027897596359253, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7108538746833801, + "num_tokens": 94162472.0, + "step": 3704 + }, + { + "epoch": 0.4068745881836152, + "grad_norm": 2.462939500808716, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.6933444142341614, + "num_tokens": 94184828.0, + "step": 3705 + }, + { + "epoch": 0.40698440588622886, + "grad_norm": 2.5953209400177, + "learning_rate": 1e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7254841327667236, + "num_tokens": 94204339.0, + "step": 3706 + }, + { + "epoch": 0.4070942235888425, + "grad_norm": 2.29384446144104, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7112947702407837, + "num_tokens": 94227154.0, + "step": 3707 + }, + { + "epoch": 0.4072040412914562, + "grad_norm": 1.9586561918258667, + "learning_rate": 1e-06, + "loss": 1.0793, + "mean_token_accuracy": 0.6881831288337708, + "num_tokens": 94258434.0, + "step": 3708 + }, + { + "epoch": 0.40731385899406986, + "grad_norm": 2.305781602859497, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.6983931660652161, + "num_tokens": 94282844.0, + "step": 3709 + }, + { + "epoch": 0.4074236766966835, + "grad_norm": 2.3807291984558105, + "learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.706438422203064, + "num_tokens": 94305369.0, + "step": 3710 + }, + { + "epoch": 0.40753349439929715, + "grad_norm": 2.32676362991333, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 0.70432448387146, + "num_tokens": 94329754.0, + "step": 3711 + }, + { + 
"epoch": 0.40764331210191085, + "grad_norm": 2.3813774585723877, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7142339944839478, + "num_tokens": 94351783.0, + "step": 3712 + }, + { + "epoch": 0.4077531298045245, + "grad_norm": 2.4701461791992188, + "learning_rate": 1e-06, + "loss": 1.0632, + "mean_token_accuracy": 0.684257984161377, + "num_tokens": 94373977.0, + "step": 3713 + }, + { + "epoch": 0.40786294750713814, + "grad_norm": 2.5382769107818604, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7092863917350769, + "num_tokens": 94393581.0, + "step": 3714 + }, + { + "epoch": 0.4079727652097518, + "grad_norm": 2.1750388145446777, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7111256122589111, + "num_tokens": 94419666.0, + "step": 3715 + }, + { + "epoch": 0.4080825829123655, + "grad_norm": 2.329261064529419, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.7018001079559326, + "num_tokens": 94441519.0, + "step": 3716 + }, + { + "epoch": 0.40819240061497913, + "grad_norm": 2.506823778152466, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.72478848695755, + "num_tokens": 94461401.0, + "step": 3717 + }, + { + "epoch": 0.4083022183175928, + "grad_norm": 2.115076780319214, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.6957703828811646, + "num_tokens": 94488617.0, + "step": 3718 + }, + { + "epoch": 0.4084120360202065, + "grad_norm": 2.226184129714966, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.6926724910736084, + "num_tokens": 94513786.0, + "step": 3719 + }, + { + "epoch": 0.4085218537228201, + "grad_norm": 2.1831507682800293, + "learning_rate": 1e-06, + "loss": 1.0509, + "mean_token_accuracy": 0.6814447641372681, + "num_tokens": 94539017.0, + "step": 3720 + }, + { + "epoch": 0.4086316714254338, + "grad_norm": 2.217761754989624, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 
0.7184250354766846, + "num_tokens": 94564067.0, + "step": 3721 + }, + { + "epoch": 0.4087414891280474, + "grad_norm": 2.0858917236328125, + "learning_rate": 1e-06, + "loss": 1.023, + "mean_token_accuracy": 0.6995889544487, + "num_tokens": 94594490.0, + "step": 3722 + }, + { + "epoch": 0.4088513068306611, + "grad_norm": 2.2685298919677734, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.6975313425064087, + "num_tokens": 94617871.0, + "step": 3723 + }, + { + "epoch": 0.40896112453327477, + "grad_norm": 2.055412769317627, + "learning_rate": 1e-06, + "loss": 1.0734, + "mean_token_accuracy": 0.6825747489929199, + "num_tokens": 94648728.0, + "step": 3724 + }, + { + "epoch": 0.4090709422358884, + "grad_norm": 2.098701238632202, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7093717455863953, + "num_tokens": 94676300.0, + "step": 3725 + }, + { + "epoch": 0.40918075993850206, + "grad_norm": 2.2208011150360107, + "learning_rate": 1e-06, + "loss": 1.0254, + "mean_token_accuracy": 0.6872550845146179, + "num_tokens": 94703150.0, + "step": 3726 + }, + { + "epoch": 0.40929057764111576, + "grad_norm": 2.3758621215820312, + "learning_rate": 1e-06, + "loss": 1.0609, + "mean_token_accuracy": 0.683858335018158, + "num_tokens": 94728368.0, + "step": 3727 + }, + { + "epoch": 0.4094003953437294, + "grad_norm": 2.332249402999878, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.7024332284927368, + "num_tokens": 94751341.0, + "step": 3728 + }, + { + "epoch": 0.40951021304634305, + "grad_norm": 2.0962135791778564, + "learning_rate": 1e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.6910865306854248, + "num_tokens": 94778144.0, + "step": 3729 + }, + { + "epoch": 0.40962003074895675, + "grad_norm": 2.191554069519043, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.6938871145248413, + "num_tokens": 94804024.0, + "step": 3730 + }, + { + "epoch": 0.4097298484515704, + "grad_norm": 2.2967464923858643, + 
"learning_rate": 1e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.6940022706985474, + "num_tokens": 94828409.0, + "step": 3731 + }, + { + "epoch": 0.40983966615418405, + "grad_norm": 2.392390727996826, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7053990960121155, + "num_tokens": 94852423.0, + "step": 3732 + }, + { + "epoch": 0.4099494838567977, + "grad_norm": 2.3433961868286133, + "learning_rate": 1e-06, + "loss": 0.871, + "mean_token_accuracy": 0.7329244613647461, + "num_tokens": 94873571.0, + "step": 3733 + }, + { + "epoch": 0.4100593015594114, + "grad_norm": 2.4877231121063232, + "learning_rate": 1e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.7323617935180664, + "num_tokens": 94893432.0, + "step": 3734 + }, + { + "epoch": 0.41016911926202504, + "grad_norm": 2.5050899982452393, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7211899757385254, + "num_tokens": 94914713.0, + "step": 3735 + }, + { + "epoch": 0.4102789369646387, + "grad_norm": 2.351238250732422, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7075027227401733, + "num_tokens": 94937679.0, + "step": 3736 + }, + { + "epoch": 0.4103887546672524, + "grad_norm": 2.415846586227417, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7149406671524048, + "num_tokens": 94958310.0, + "step": 3737 + }, + { + "epoch": 0.41049857236986603, + "grad_norm": 2.129817008972168, + "learning_rate": 1e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7300323247909546, + "num_tokens": 94983789.0, + "step": 3738 + }, + { + "epoch": 0.4106083900724797, + "grad_norm": 2.100040912628174, + "learning_rate": 1e-06, + "loss": 1.0427, + "mean_token_accuracy": 0.684784471988678, + "num_tokens": 95012236.0, + "step": 3739 + }, + { + "epoch": 0.4107182077750933, + "grad_norm": 2.2838010787963867, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.6998955011367798, + "num_tokens": 95035998.0, + "step": 3740 + }, + { + 
"epoch": 0.410828025477707, + "grad_norm": 2.471252918243408, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.6919933557510376, + "num_tokens": 95056830.0, + "step": 3741 + }, + { + "epoch": 0.41093784318032067, + "grad_norm": 2.1699326038360596, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7028740644454956, + "num_tokens": 95082553.0, + "step": 3742 + }, + { + "epoch": 0.4110476608829343, + "grad_norm": 2.1074626445770264, + "learning_rate": 1e-06, + "loss": 1.0514, + "mean_token_accuracy": 0.6879056692123413, + "num_tokens": 95113333.0, + "step": 3743 + }, + { + "epoch": 0.41115747858554796, + "grad_norm": 2.188441038131714, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.6867117881774902, + "num_tokens": 95141528.0, + "step": 3744 + }, + { + "epoch": 0.41126729628816167, + "grad_norm": 2.3834683895111084, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7010586261749268, + "num_tokens": 95163553.0, + "step": 3745 + }, + { + "epoch": 0.4113771139907753, + "grad_norm": 2.1463072299957275, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7056330442428589, + "num_tokens": 95188505.0, + "step": 3746 + }, + { + "epoch": 0.41148693169338896, + "grad_norm": 2.0833230018615723, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.698411226272583, + "num_tokens": 95217160.0, + "step": 3747 + }, + { + "epoch": 0.41159674939600266, + "grad_norm": 2.069157123565674, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.692290186882019, + "num_tokens": 95244586.0, + "step": 3748 + }, + { + "epoch": 0.4117065670986163, + "grad_norm": 2.001126289367676, + "learning_rate": 1e-06, + "loss": 1.0834, + "mean_token_accuracy": 0.6730421781539917, + "num_tokens": 95275795.0, + "step": 3749 + }, + { + "epoch": 0.41181638480122995, + "grad_norm": 2.3977084159851074, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 
0.6991035342216492, + "num_tokens": 95299486.0, + "step": 3750 + }, + { + "epoch": 0.4119262025038436, + "grad_norm": 2.0786502361297607, + "learning_rate": 1e-06, + "loss": 1.043, + "mean_token_accuracy": 0.6851344108581543, + "num_tokens": 95328604.0, + "step": 3751 + }, + { + "epoch": 0.4120360202064573, + "grad_norm": 1.8375706672668457, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.71148282289505, + "num_tokens": 95362614.0, + "step": 3752 + }, + { + "epoch": 0.41214583790907094, + "grad_norm": 2.164328098297119, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7059004306793213, + "num_tokens": 95388620.0, + "step": 3753 + }, + { + "epoch": 0.4122556556116846, + "grad_norm": 2.2133429050445557, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.7048587799072266, + "num_tokens": 95413904.0, + "step": 3754 + }, + { + "epoch": 0.4123654733142983, + "grad_norm": 2.2915265560150146, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7067393064498901, + "num_tokens": 95438400.0, + "step": 3755 + }, + { + "epoch": 0.41247529101691194, + "grad_norm": 2.052070140838623, + "learning_rate": 1e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7272685170173645, + "num_tokens": 95464471.0, + "step": 3756 + }, + { + "epoch": 0.4125851087195256, + "grad_norm": 2.3905885219573975, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7070937156677246, + "num_tokens": 95487893.0, + "step": 3757 + }, + { + "epoch": 0.41269492642213923, + "grad_norm": 2.2631890773773193, + "learning_rate": 1e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.6860329508781433, + "num_tokens": 95516527.0, + "step": 3758 + }, + { + "epoch": 0.41280474412475293, + "grad_norm": 1.9838823080062866, + "learning_rate": 1e-06, + "loss": 1.0448, + "mean_token_accuracy": 0.6863710880279541, + "num_tokens": 95548084.0, + "step": 3759 + }, + { + "epoch": 0.4129145618273666, + "grad_norm": 2.2638936042785645, 
+ "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.704401969909668, + "num_tokens": 95573393.0, + "step": 3760 + }, + { + "epoch": 0.4130243795299802, + "grad_norm": 2.313399314880371, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.7072160840034485, + "num_tokens": 95597998.0, + "step": 3761 + }, + { + "epoch": 0.41313419723259387, + "grad_norm": 2.186110734939575, + "learning_rate": 1e-06, + "loss": 1.0129, + "mean_token_accuracy": 0.6970512866973877, + "num_tokens": 95625583.0, + "step": 3762 + }, + { + "epoch": 0.41324401493520757, + "grad_norm": 2.082702875137329, + "learning_rate": 1e-06, + "loss": 1.0686, + "mean_token_accuracy": 0.6812991499900818, + "num_tokens": 95655845.0, + "step": 3763 + }, + { + "epoch": 0.4133538326378212, + "grad_norm": 2.1742262840270996, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7077208161354065, + "num_tokens": 95683827.0, + "step": 3764 + }, + { + "epoch": 0.41346365034043486, + "grad_norm": 2.238624095916748, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7117794752120972, + "num_tokens": 95708472.0, + "step": 3765 + }, + { + "epoch": 0.41357346804304856, + "grad_norm": 2.34187388420105, + "learning_rate": 1e-06, + "loss": 1.0307, + "mean_token_accuracy": 0.6858025789260864, + "num_tokens": 95732150.0, + "step": 3766 + }, + { + "epoch": 0.4136832857456622, + "grad_norm": 2.177459239959717, + "learning_rate": 1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.6949227452278137, + "num_tokens": 95759018.0, + "step": 3767 + }, + { + "epoch": 0.41379310344827586, + "grad_norm": 2.408430337905884, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.707638144493103, + "num_tokens": 95784840.0, + "step": 3768 + }, + { + "epoch": 0.4139029211508895, + "grad_norm": 2.4377713203430176, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7013927698135376, + "num_tokens": 95805698.0, + "step": 3769 + }, + { 
+ "epoch": 0.4140127388535032, + "grad_norm": 2.3274314403533936, + "learning_rate": 1e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.6908010244369507, + "num_tokens": 95831126.0, + "step": 3770 + }, + { + "epoch": 0.41412255655611685, + "grad_norm": 2.1015570163726807, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.6989943981170654, + "num_tokens": 95858439.0, + "step": 3771 + }, + { + "epoch": 0.4142323742587305, + "grad_norm": 2.112466335296631, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7232626676559448, + "num_tokens": 95886026.0, + "step": 3772 + }, + { + "epoch": 0.41434219196134414, + "grad_norm": 2.4464316368103027, + "learning_rate": 1e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.6931928992271423, + "num_tokens": 95907545.0, + "step": 3773 + }, + { + "epoch": 0.41445200966395784, + "grad_norm": 2.100050687789917, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7065213918685913, + "num_tokens": 95936929.0, + "step": 3774 + }, + { + "epoch": 0.4145618273665715, + "grad_norm": 2.410844564437866, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7251220941543579, + "num_tokens": 95960401.0, + "step": 3775 + }, + { + "epoch": 0.41467164506918514, + "grad_norm": 2.2204184532165527, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.6923134326934814, + "num_tokens": 95988680.0, + "step": 3776 + }, + { + "epoch": 0.41478146277179884, + "grad_norm": 2.4805104732513428, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7141982913017273, + "num_tokens": 96008548.0, + "step": 3777 + }, + { + "epoch": 0.4148912804744125, + "grad_norm": 2.0764973163604736, + "learning_rate": 1e-06, + "loss": 1.1033, + "mean_token_accuracy": 0.670039176940918, + "num_tokens": 96039156.0, + "step": 3778 + }, + { + "epoch": 0.41500109817702613, + "grad_norm": 2.173204183578491, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 
0.698237419128418, + "num_tokens": 96066146.0, + "step": 3779 + }, + { + "epoch": 0.4151109158796398, + "grad_norm": 2.6532976627349854, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.6996811628341675, + "num_tokens": 96086328.0, + "step": 3780 + }, + { + "epoch": 0.4152207335822535, + "grad_norm": 2.8277969360351562, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7137503027915955, + "num_tokens": 96102963.0, + "step": 3781 + }, + { + "epoch": 0.4153305512848671, + "grad_norm": 2.108574867248535, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6911308765411377, + "num_tokens": 96134966.0, + "step": 3782 + }, + { + "epoch": 0.41544036898748077, + "grad_norm": 2.4955854415893555, + "learning_rate": 1e-06, + "loss": 0.8667, + "mean_token_accuracy": 0.7283694744110107, + "num_tokens": 96154605.0, + "step": 3783 + }, + { + "epoch": 0.41555018669009447, + "grad_norm": 2.2252790927886963, + "learning_rate": 1e-06, + "loss": 1.028, + "mean_token_accuracy": 0.6897639036178589, + "num_tokens": 96179539.0, + "step": 3784 + }, + { + "epoch": 0.4156600043927081, + "grad_norm": 2.4314467906951904, + "learning_rate": 1e-06, + "loss": 1.0257, + "mean_token_accuracy": 0.6964122653007507, + "num_tokens": 96201457.0, + "step": 3785 + }, + { + "epoch": 0.41576982209532176, + "grad_norm": 2.1325533390045166, + "learning_rate": 1e-06, + "loss": 1.0727, + "mean_token_accuracy": 0.6858051419258118, + "num_tokens": 96226936.0, + "step": 3786 + }, + { + "epoch": 0.4158796397979354, + "grad_norm": 1.9953125715255737, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7245821952819824, + "num_tokens": 96254323.0, + "step": 3787 + }, + { + "epoch": 0.4159894575005491, + "grad_norm": 2.1702637672424316, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.7009766101837158, + "num_tokens": 96282300.0, + "step": 3788 + }, + { + "epoch": 0.41609927520316276, + "grad_norm": 
2.3920559883117676, + "learning_rate": 1e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.6861909627914429, + "num_tokens": 96306330.0, + "step": 3789 + }, + { + "epoch": 0.4162090929057764, + "grad_norm": 2.4094502925872803, + "learning_rate": 1e-06, + "loss": 0.896, + "mean_token_accuracy": 0.7246808409690857, + "num_tokens": 96327087.0, + "step": 3790 + }, + { + "epoch": 0.41631891060839005, + "grad_norm": 2.2613675594329834, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7164655923843384, + "num_tokens": 96347991.0, + "step": 3791 + }, + { + "epoch": 0.41642872831100375, + "grad_norm": 2.03326678276062, + "learning_rate": 1e-06, + "loss": 1.0832, + "mean_token_accuracy": 0.6729429364204407, + "num_tokens": 96378654.0, + "step": 3792 + }, + { + "epoch": 0.4165385460136174, + "grad_norm": 2.0205891132354736, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7121238112449646, + "num_tokens": 96407187.0, + "step": 3793 + }, + { + "epoch": 0.41664836371623104, + "grad_norm": 1.900827407836914, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7106174230575562, + "num_tokens": 96439966.0, + "step": 3794 + }, + { + "epoch": 0.41675818141884474, + "grad_norm": 2.134105920791626, + "learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.6869654059410095, + "num_tokens": 96467175.0, + "step": 3795 + }, + { + "epoch": 0.4168679991214584, + "grad_norm": 2.0307700634002686, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.7012158632278442, + "num_tokens": 96495692.0, + "step": 3796 + }, + { + "epoch": 0.41697781682407203, + "grad_norm": 2.1892852783203125, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.6941688060760498, + "num_tokens": 96522188.0, + "step": 3797 + }, + { + "epoch": 0.4170876345266857, + "grad_norm": 2.5196340084075928, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7157723903656006, + "num_tokens": 96542140.0, + 
"step": 3798 + }, + { + "epoch": 0.4171974522292994, + "grad_norm": 2.690418243408203, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.720727801322937, + "num_tokens": 96559999.0, + "step": 3799 + }, + { + "epoch": 0.417307269931913, + "grad_norm": 2.221442461013794, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7032859921455383, + "num_tokens": 96586329.0, + "step": 3800 + }, + { + "epoch": 0.4174170876345267, + "grad_norm": 2.483682632446289, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.6971914172172546, + "num_tokens": 96607497.0, + "step": 3801 + }, + { + "epoch": 0.4175269053371403, + "grad_norm": 2.111537218093872, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7060970067977905, + "num_tokens": 96635174.0, + "step": 3802 + }, + { + "epoch": 0.417636723039754, + "grad_norm": 2.3780250549316406, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7180277705192566, + "num_tokens": 96658271.0, + "step": 3803 + }, + { + "epoch": 0.41774654074236767, + "grad_norm": 2.294419527053833, + "learning_rate": 1e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7455042600631714, + "num_tokens": 96681437.0, + "step": 3804 + }, + { + "epoch": 0.4178563584449813, + "grad_norm": 2.227503538131714, + "learning_rate": 1e-06, + "loss": 1.0388, + "mean_token_accuracy": 0.7030559778213501, + "num_tokens": 96707204.0, + "step": 3805 + }, + { + "epoch": 0.417966176147595, + "grad_norm": 2.380234718322754, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7061516046524048, + "num_tokens": 96729820.0, + "step": 3806 + }, + { + "epoch": 0.41807599385020866, + "grad_norm": 1.9936845302581787, + "learning_rate": 1e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.6975980401039124, + "num_tokens": 96760354.0, + "step": 3807 + }, + { + "epoch": 0.4181858115528223, + "grad_norm": 2.2287814617156982, + "learning_rate": 1e-06, + "loss": 0.8597, + 
"mean_token_accuracy": 0.7320727109909058, + "num_tokens": 96784513.0, + "step": 3808 + }, + { + "epoch": 0.41829562925543595, + "grad_norm": 2.4705262184143066, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.6948115825653076, + "num_tokens": 96805671.0, + "step": 3809 + }, + { + "epoch": 0.41840544695804965, + "grad_norm": 2.293097972869873, + "learning_rate": 1e-06, + "loss": 1.0129, + "mean_token_accuracy": 0.6885854005813599, + "num_tokens": 96830405.0, + "step": 3810 + }, + { + "epoch": 0.4185152646606633, + "grad_norm": 1.8419967889785767, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7267435789108276, + "num_tokens": 96865526.0, + "step": 3811 + }, + { + "epoch": 0.41862508236327695, + "grad_norm": 2.3020126819610596, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.7011600732803345, + "num_tokens": 96889645.0, + "step": 3812 + }, + { + "epoch": 0.41873490006589065, + "grad_norm": 2.187211275100708, + "learning_rate": 1e-06, + "loss": 1.0375, + "mean_token_accuracy": 0.683447003364563, + "num_tokens": 96916840.0, + "step": 3813 + }, + { + "epoch": 0.4188447177685043, + "grad_norm": 2.4257912635803223, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.6975844502449036, + "num_tokens": 96938538.0, + "step": 3814 + }, + { + "epoch": 0.41895453547111794, + "grad_norm": 1.9166226387023926, + "learning_rate": 1e-06, + "loss": 1.0296, + "mean_token_accuracy": 0.6877102851867676, + "num_tokens": 96971585.0, + "step": 3815 + }, + { + "epoch": 0.4190643531737316, + "grad_norm": 2.2403180599212646, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.6971608400344849, + "num_tokens": 96995642.0, + "step": 3816 + }, + { + "epoch": 0.4191741708763453, + "grad_norm": 2.4959452152252197, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7132337093353271, + "num_tokens": 97017937.0, + "step": 3817 + }, + { + "epoch": 0.41928398857895893, + 
"grad_norm": 2.132114887237549, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7220407724380493, + "num_tokens": 97045148.0, + "step": 3818 + }, + { + "epoch": 0.4193938062815726, + "grad_norm": 2.5304501056671143, + "learning_rate": 1e-06, + "loss": 1.025, + "mean_token_accuracy": 0.6916495561599731, + "num_tokens": 97066934.0, + "step": 3819 + }, + { + "epoch": 0.4195036239841862, + "grad_norm": 2.348869800567627, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.7031615972518921, + "num_tokens": 97092327.0, + "step": 3820 + }, + { + "epoch": 0.4196134416867999, + "grad_norm": 2.2584004402160645, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7266826033592224, + "num_tokens": 97116416.0, + "step": 3821 + }, + { + "epoch": 0.41972325938941357, + "grad_norm": 2.4450740814208984, + "learning_rate": 1e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.7267217040061951, + "num_tokens": 97136776.0, + "step": 3822 + }, + { + "epoch": 0.4198330770920272, + "grad_norm": 2.3231759071350098, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7074248790740967, + "num_tokens": 97160382.0, + "step": 3823 + }, + { + "epoch": 0.4199428947946409, + "grad_norm": 2.3081986904144287, + "learning_rate": 1e-06, + "loss": 1.0933, + "mean_token_accuracy": 0.6737709641456604, + "num_tokens": 97186144.0, + "step": 3824 + }, + { + "epoch": 0.42005271249725457, + "grad_norm": 2.3875162601470947, + "learning_rate": 1e-06, + "loss": 0.8502, + "mean_token_accuracy": 0.7288298606872559, + "num_tokens": 97208031.0, + "step": 3825 + }, + { + "epoch": 0.4201625301998682, + "grad_norm": 2.3656883239746094, + "learning_rate": 1e-06, + "loss": 0.869, + "mean_token_accuracy": 0.7299972772598267, + "num_tokens": 97229764.0, + "step": 3826 + }, + { + "epoch": 0.42027234790248186, + "grad_norm": 2.3220877647399902, + "learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.692374587059021, + "num_tokens": 
97255571.0, + "step": 3827 + }, + { + "epoch": 0.42038216560509556, + "grad_norm": 2.5320942401885986, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7040061950683594, + "num_tokens": 97275981.0, + "step": 3828 + }, + { + "epoch": 0.4204919833077092, + "grad_norm": 2.2623345851898193, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.6972007155418396, + "num_tokens": 97303137.0, + "step": 3829 + }, + { + "epoch": 0.42060180101032285, + "grad_norm": 2.2971537113189697, + "learning_rate": 1e-06, + "loss": 1.0441, + "mean_token_accuracy": 0.6820782423019409, + "num_tokens": 97326943.0, + "step": 3830 + }, + { + "epoch": 0.42071161871293655, + "grad_norm": 2.485494613647461, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7207808494567871, + "num_tokens": 97348581.0, + "step": 3831 + }, + { + "epoch": 0.4208214364155502, + "grad_norm": 2.1587836742401123, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.6981468200683594, + "num_tokens": 97373077.0, + "step": 3832 + }, + { + "epoch": 0.42093125411816384, + "grad_norm": 2.2688491344451904, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.709177553653717, + "num_tokens": 97397747.0, + "step": 3833 + }, + { + "epoch": 0.4210410718207775, + "grad_norm": 2.35121750831604, + "learning_rate": 1e-06, + "loss": 1.033, + "mean_token_accuracy": 0.6912554502487183, + "num_tokens": 97421611.0, + "step": 3834 + }, + { + "epoch": 0.4211508895233912, + "grad_norm": 2.1223623752593994, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.7010539770126343, + "num_tokens": 97450004.0, + "step": 3835 + }, + { + "epoch": 0.42126070722600484, + "grad_norm": 2.2088303565979004, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7151331901550293, + "num_tokens": 97476338.0, + "step": 3836 + }, + { + "epoch": 0.4213705249286185, + "grad_norm": 2.2063961029052734, + "learning_rate": 1e-06, + 
"loss": 1.0112, + "mean_token_accuracy": 0.6931586861610413, + "num_tokens": 97503734.0, + "step": 3837 + }, + { + "epoch": 0.42148034263123213, + "grad_norm": 2.1407201290130615, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7158176302909851, + "num_tokens": 97531585.0, + "step": 3838 + }, + { + "epoch": 0.42159016033384583, + "grad_norm": 1.935565710067749, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7107245922088623, + "num_tokens": 97560929.0, + "step": 3839 + }, + { + "epoch": 0.4216999780364595, + "grad_norm": 2.3995938301086426, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7040696740150452, + "num_tokens": 97585003.0, + "step": 3840 + }, + { + "epoch": 0.4218097957390731, + "grad_norm": 2.3194005489349365, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7074884176254272, + "num_tokens": 97607453.0, + "step": 3841 + }, + { + "epoch": 0.4219196134416868, + "grad_norm": 1.9395378828048706, + "learning_rate": 1e-06, + "loss": 1.075, + "mean_token_accuracy": 0.6828060150146484, + "num_tokens": 97642515.0, + "step": 3842 + }, + { + "epoch": 0.42202943114430047, + "grad_norm": 2.007084608078003, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7208213806152344, + "num_tokens": 97671683.0, + "step": 3843 + }, + { + "epoch": 0.4221392488469141, + "grad_norm": 2.2108664512634277, + "learning_rate": 1e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.7018295526504517, + "num_tokens": 97696417.0, + "step": 3844 + }, + { + "epoch": 0.42224906654952776, + "grad_norm": 1.9984912872314453, + "learning_rate": 1e-06, + "loss": 1.066, + "mean_token_accuracy": 0.6811652779579163, + "num_tokens": 97725922.0, + "step": 3845 + }, + { + "epoch": 0.42235888425214146, + "grad_norm": 2.1917052268981934, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.700515627861023, + "num_tokens": 97753144.0, + "step": 3846 + }, + { + "epoch": 
0.4224687019547551, + "grad_norm": 2.1885828971862793, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7182453870773315, + "num_tokens": 97776916.0, + "step": 3847 + }, + { + "epoch": 0.42257851965736876, + "grad_norm": 2.157986640930176, + "learning_rate": 1e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.7084852457046509, + "num_tokens": 97804027.0, + "step": 3848 + }, + { + "epoch": 0.4226883373599824, + "grad_norm": 2.1971797943115234, + "learning_rate": 1e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.6965925097465515, + "num_tokens": 97831068.0, + "step": 3849 + }, + { + "epoch": 0.4227981550625961, + "grad_norm": 2.0979411602020264, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.6986382007598877, + "num_tokens": 97858784.0, + "step": 3850 + }, + { + "epoch": 0.42290797276520975, + "grad_norm": 2.38974928855896, + "learning_rate": 1e-06, + "loss": 1.0895, + "mean_token_accuracy": 0.6912219524383545, + "num_tokens": 97882403.0, + "step": 3851 + }, + { + "epoch": 0.4230177904678234, + "grad_norm": 2.3108484745025635, + "learning_rate": 1e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7230691313743591, + "num_tokens": 97904614.0, + "step": 3852 + }, + { + "epoch": 0.4231276081704371, + "grad_norm": 2.2990596294403076, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.6945631504058838, + "num_tokens": 97928999.0, + "step": 3853 + }, + { + "epoch": 0.42323742587305074, + "grad_norm": 2.217363119125366, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7055834531784058, + "num_tokens": 97954484.0, + "step": 3854 + }, + { + "epoch": 0.4233472435756644, + "grad_norm": 2.271336317062378, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7105051279067993, + "num_tokens": 97978011.0, + "step": 3855 + }, + { + "epoch": 0.42345706127827804, + "grad_norm": 2.3860275745391846, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 
0.7042009830474854, + "num_tokens": 98001198.0, + "step": 3856 + }, + { + "epoch": 0.42356687898089174, + "grad_norm": 2.3166356086730957, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7069675922393799, + "num_tokens": 98024457.0, + "step": 3857 + }, + { + "epoch": 0.4236766966835054, + "grad_norm": 2.283078908920288, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.7003194093704224, + "num_tokens": 98048621.0, + "step": 3858 + }, + { + "epoch": 0.42378651438611903, + "grad_norm": 2.2628591060638428, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7192785143852234, + "num_tokens": 98073379.0, + "step": 3859 + }, + { + "epoch": 0.42389633208873273, + "grad_norm": 2.0536885261535645, + "learning_rate": 1e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.6981955766677856, + "num_tokens": 98103000.0, + "step": 3860 + }, + { + "epoch": 0.4240061497913464, + "grad_norm": 2.1542441844940186, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.703810453414917, + "num_tokens": 98128970.0, + "step": 3861 + }, + { + "epoch": 0.42411596749396, + "grad_norm": 2.071850299835205, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.7064675092697144, + "num_tokens": 98158593.0, + "step": 3862 + }, + { + "epoch": 0.42422578519657367, + "grad_norm": 2.692255973815918, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7136509418487549, + "num_tokens": 98177565.0, + "step": 3863 + }, + { + "epoch": 0.42433560289918737, + "grad_norm": 2.11726450920105, + "learning_rate": 1e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.691687822341919, + "num_tokens": 98205938.0, + "step": 3864 + }, + { + "epoch": 0.424445420601801, + "grad_norm": 2.348806381225586, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7039550542831421, + "num_tokens": 98228915.0, + "step": 3865 + }, + { + "epoch": 0.42455523830441466, + "grad_norm": 2.095188617706299, + 
"learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.692815899848938, + "num_tokens": 98258833.0, + "step": 3866 + }, + { + "epoch": 0.4246650560070283, + "grad_norm": 2.1396942138671875, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.692388653755188, + "num_tokens": 98288309.0, + "step": 3867 + }, + { + "epoch": 0.424774873709642, + "grad_norm": 2.0583763122558594, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7260967493057251, + "num_tokens": 98316155.0, + "step": 3868 + }, + { + "epoch": 0.42488469141225566, + "grad_norm": 2.395805597305298, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.726722002029419, + "num_tokens": 98337010.0, + "step": 3869 + }, + { + "epoch": 0.4249945091148693, + "grad_norm": 2.1088101863861084, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.6963474750518799, + "num_tokens": 98364624.0, + "step": 3870 + }, + { + "epoch": 0.425104326817483, + "grad_norm": 2.383615016937256, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.713405966758728, + "num_tokens": 98388157.0, + "step": 3871 + }, + { + "epoch": 0.42521414452009665, + "grad_norm": 2.085881471633911, + "learning_rate": 1e-06, + "loss": 1.0172, + "mean_token_accuracy": 0.7003874778747559, + "num_tokens": 98416535.0, + "step": 3872 + }, + { + "epoch": 0.4253239622227103, + "grad_norm": 2.1683177947998047, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7048483490943909, + "num_tokens": 98442440.0, + "step": 3873 + }, + { + "epoch": 0.42543377992532394, + "grad_norm": 2.478447914123535, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.7049412727355957, + "num_tokens": 98463668.0, + "step": 3874 + }, + { + "epoch": 0.42554359762793764, + "grad_norm": 2.1135001182556152, + "learning_rate": 1e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.6924845576286316, + "num_tokens": 98493518.0, + "step": 3875 + }, + { + 
"epoch": 0.4256534153305513, + "grad_norm": 1.9629662036895752, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.693600058555603, + "num_tokens": 98528537.0, + "step": 3876 + }, + { + "epoch": 0.42576323303316493, + "grad_norm": 2.1463546752929688, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.6991292238235474, + "num_tokens": 98553660.0, + "step": 3877 + }, + { + "epoch": 0.4258730507357786, + "grad_norm": 2.504009246826172, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7195155024528503, + "num_tokens": 98573112.0, + "step": 3878 + }, + { + "epoch": 0.4259828684383923, + "grad_norm": 2.2964696884155273, + "learning_rate": 1e-06, + "loss": 1.024, + "mean_token_accuracy": 0.6872431039810181, + "num_tokens": 98597971.0, + "step": 3879 + }, + { + "epoch": 0.4260926861410059, + "grad_norm": 2.6353955268859863, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.686388373374939, + "num_tokens": 98618467.0, + "step": 3880 + }, + { + "epoch": 0.4262025038436196, + "grad_norm": 2.1246960163116455, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7140563130378723, + "num_tokens": 98644623.0, + "step": 3881 + }, + { + "epoch": 0.4263123215462333, + "grad_norm": 2.09757137298584, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.695176362991333, + "num_tokens": 98671003.0, + "step": 3882 + }, + { + "epoch": 0.4264221392488469, + "grad_norm": 2.1887905597686768, + "learning_rate": 1e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.6916124224662781, + "num_tokens": 98698992.0, + "step": 3883 + }, + { + "epoch": 0.42653195695146057, + "grad_norm": 2.418314218521118, + "learning_rate": 1e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.7319836616516113, + "num_tokens": 98720015.0, + "step": 3884 + }, + { + "epoch": 0.4266417746540742, + "grad_norm": 2.342883586883545, + "learning_rate": 1e-06, + "loss": 1.0343, + "mean_token_accuracy": 
0.6922115087509155, + "num_tokens": 98745436.0, + "step": 3885 + }, + { + "epoch": 0.4267515923566879, + "grad_norm": 2.0407328605651855, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7152812480926514, + "num_tokens": 98771912.0, + "step": 3886 + }, + { + "epoch": 0.42686141005930156, + "grad_norm": 2.1515893936157227, + "learning_rate": 1e-06, + "loss": 1.1256, + "mean_token_accuracy": 0.6656541228294373, + "num_tokens": 98799606.0, + "step": 3887 + }, + { + "epoch": 0.4269712277619152, + "grad_norm": 2.3654541969299316, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7044954299926758, + "num_tokens": 98821410.0, + "step": 3888 + }, + { + "epoch": 0.4270810454645289, + "grad_norm": 2.301283121109009, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.6978143453598022, + "num_tokens": 98845152.0, + "step": 3889 + }, + { + "epoch": 0.42719086316714255, + "grad_norm": 2.26074481010437, + "learning_rate": 1e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.6908949017524719, + "num_tokens": 98871023.0, + "step": 3890 + }, + { + "epoch": 0.4273006808697562, + "grad_norm": 2.3979239463806152, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7061004638671875, + "num_tokens": 98892890.0, + "step": 3891 + }, + { + "epoch": 0.42741049857236985, + "grad_norm": 2.3996312618255615, + "learning_rate": 1e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.7292884588241577, + "num_tokens": 98913821.0, + "step": 3892 + }, + { + "epoch": 0.42752031627498355, + "grad_norm": 2.4481112957000732, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7174045443534851, + "num_tokens": 98936696.0, + "step": 3893 + }, + { + "epoch": 0.4276301339775972, + "grad_norm": 1.951090931892395, + "learning_rate": 1e-06, + "loss": 1.0972, + "mean_token_accuracy": 0.676959753036499, + "num_tokens": 98970512.0, + "step": 3894 + }, + { + "epoch": 0.42773995168021084, + "grad_norm": 1.994856595993042, 
+ "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.6972161531448364, + "num_tokens": 99002720.0, + "step": 3895 + }, + { + "epoch": 0.4278497693828245, + "grad_norm": 2.5146687030792236, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.715787410736084, + "num_tokens": 99022554.0, + "step": 3896 + }, + { + "epoch": 0.4279595870854382, + "grad_norm": 2.555353879928589, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7051267027854919, + "num_tokens": 99042908.0, + "step": 3897 + }, + { + "epoch": 0.42806940478805183, + "grad_norm": 1.9206799268722534, + "learning_rate": 1e-06, + "loss": 1.0468, + "mean_token_accuracy": 0.6838650107383728, + "num_tokens": 99077762.0, + "step": 3898 + }, + { + "epoch": 0.4281792224906655, + "grad_norm": 2.4207499027252197, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7114775776863098, + "num_tokens": 99100027.0, + "step": 3899 + }, + { + "epoch": 0.4282890401932792, + "grad_norm": 2.0354623794555664, + "learning_rate": 1e-06, + "loss": 1.1153, + "mean_token_accuracy": 0.6724589467048645, + "num_tokens": 99132699.0, + "step": 3900 + }, + { + "epoch": 0.4283988578958928, + "grad_norm": 2.64479398727417, + "learning_rate": 1e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7313292026519775, + "num_tokens": 99150260.0, + "step": 3901 + }, + { + "epoch": 0.42850867559850647, + "grad_norm": 1.866356611251831, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7141433954238892, + "num_tokens": 99183984.0, + "step": 3902 + }, + { + "epoch": 0.4286184933011201, + "grad_norm": 2.2986488342285156, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.6974764466285706, + "num_tokens": 99208171.0, + "step": 3903 + }, + { + "epoch": 0.4287283110037338, + "grad_norm": 2.1205062866210938, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7210182547569275, + "num_tokens": 99234575.0, + "step": 3904 + }, + 
{ + "epoch": 0.42883812870634747, + "grad_norm": 2.2217459678649902, + "learning_rate": 1e-06, + "loss": 1.0511, + "mean_token_accuracy": 0.6817567348480225, + "num_tokens": 99261829.0, + "step": 3905 + }, + { + "epoch": 0.4289479464089611, + "grad_norm": 2.0676112174987793, + "learning_rate": 1e-06, + "loss": 1.0667, + "mean_token_accuracy": 0.6817851066589355, + "num_tokens": 99290983.0, + "step": 3906 + }, + { + "epoch": 0.4290577641115748, + "grad_norm": 2.0428643226623535, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.7005082368850708, + "num_tokens": 99322711.0, + "step": 3907 + }, + { + "epoch": 0.42916758181418846, + "grad_norm": 2.491783857345581, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7089943885803223, + "num_tokens": 99345208.0, + "step": 3908 + }, + { + "epoch": 0.4292773995168021, + "grad_norm": 2.142589569091797, + "learning_rate": 1e-06, + "loss": 1.0562, + "mean_token_accuracy": 0.681975245475769, + "num_tokens": 99372798.0, + "step": 3909 + }, + { + "epoch": 0.42938721721941575, + "grad_norm": 2.2332100868225098, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7309871912002563, + "num_tokens": 99395813.0, + "step": 3910 + }, + { + "epoch": 0.42949703492202945, + "grad_norm": 2.4412994384765625, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7218427062034607, + "num_tokens": 99417875.0, + "step": 3911 + }, + { + "epoch": 0.4296068526246431, + "grad_norm": 2.3583245277404785, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.714371919631958, + "num_tokens": 99440970.0, + "step": 3912 + }, + { + "epoch": 0.42971667032725674, + "grad_norm": 2.084261417388916, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.6935467720031738, + "num_tokens": 99466799.0, + "step": 3913 + }, + { + "epoch": 0.4298264880298704, + "grad_norm": 2.5071678161621094, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 
0.7132048606872559, + "num_tokens": 99486958.0, + "step": 3914 + }, + { + "epoch": 0.4299363057324841, + "grad_norm": 1.9566866159439087, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.6923444271087646, + "num_tokens": 99519948.0, + "step": 3915 + }, + { + "epoch": 0.43004612343509774, + "grad_norm": 2.287199020385742, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7200056314468384, + "num_tokens": 99543926.0, + "step": 3916 + }, + { + "epoch": 0.4301559411377114, + "grad_norm": 2.0688107013702393, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7143876552581787, + "num_tokens": 99570831.0, + "step": 3917 + }, + { + "epoch": 0.4302657588403251, + "grad_norm": 2.008279800415039, + "learning_rate": 1e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.6971826553344727, + "num_tokens": 99602393.0, + "step": 3918 + }, + { + "epoch": 0.43037557654293873, + "grad_norm": 2.714184522628784, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.6995460987091064, + "num_tokens": 99620126.0, + "step": 3919 + }, + { + "epoch": 0.4304853942455524, + "grad_norm": 2.26690936088562, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.6985304355621338, + "num_tokens": 99645229.0, + "step": 3920 + }, + { + "epoch": 0.430595211948166, + "grad_norm": 2.406637191772461, + "learning_rate": 1e-06, + "loss": 1.0614, + "mean_token_accuracy": 0.6770068407058716, + "num_tokens": 99669978.0, + "step": 3921 + }, + { + "epoch": 0.4307050296507797, + "grad_norm": 2.3350584506988525, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7081797122955322, + "num_tokens": 99695109.0, + "step": 3922 + }, + { + "epoch": 0.43081484735339337, + "grad_norm": 2.010401964187622, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.6934977769851685, + "num_tokens": 99724854.0, + "step": 3923 + }, + { + "epoch": 0.430924665056007, + "grad_norm": 2.1008026599884033, + 
"learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.6978615522384644, + "num_tokens": 99752165.0, + "step": 3924 + }, + { + "epoch": 0.43103448275862066, + "grad_norm": 2.2521700859069824, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7198407649993896, + "num_tokens": 99775232.0, + "step": 3925 + }, + { + "epoch": 0.43114430046123436, + "grad_norm": 2.0012145042419434, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.7066189050674438, + "num_tokens": 99806320.0, + "step": 3926 + }, + { + "epoch": 0.431254118163848, + "grad_norm": 2.260857105255127, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.696728527545929, + "num_tokens": 99831917.0, + "step": 3927 + }, + { + "epoch": 0.43136393586646166, + "grad_norm": 2.5265533924102783, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.6929565072059631, + "num_tokens": 99854557.0, + "step": 3928 + }, + { + "epoch": 0.43147375356907536, + "grad_norm": 2.3329477310180664, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.6903976202011108, + "num_tokens": 99878437.0, + "step": 3929 + }, + { + "epoch": 0.431583571271689, + "grad_norm": 2.059264898300171, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.6997319459915161, + "num_tokens": 99909179.0, + "step": 3930 + }, + { + "epoch": 0.43169338897430265, + "grad_norm": 2.622239828109741, + "learning_rate": 1e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7179038524627686, + "num_tokens": 99928156.0, + "step": 3931 + }, + { + "epoch": 0.4318032066769163, + "grad_norm": 2.039954662322998, + "learning_rate": 1e-06, + "loss": 1.0682, + "mean_token_accuracy": 0.6808058023452759, + "num_tokens": 99959171.0, + "step": 3932 + }, + { + "epoch": 0.43191302437953, + "grad_norm": 2.521465539932251, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7035531997680664, + "num_tokens": 99978835.0, + "step": 3933 + }, + { + 
"epoch": 0.43202284208214364, + "grad_norm": 2.3143227100372314, + "learning_rate": 1e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.700718104839325, + "num_tokens": 100004583.0, + "step": 3934 + }, + { + "epoch": 0.4321326597847573, + "grad_norm": 2.096193790435791, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7232524156570435, + "num_tokens": 100031250.0, + "step": 3935 + }, + { + "epoch": 0.432242477487371, + "grad_norm": 2.4171817302703857, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7144702672958374, + "num_tokens": 100050514.0, + "step": 3936 + }, + { + "epoch": 0.43235229518998464, + "grad_norm": 2.4535958766937256, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.6959174871444702, + "num_tokens": 100072166.0, + "step": 3937 + }, + { + "epoch": 0.4324621128925983, + "grad_norm": 2.3105416297912598, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7147812247276306, + "num_tokens": 100095649.0, + "step": 3938 + }, + { + "epoch": 0.43257193059521193, + "grad_norm": 2.4080898761749268, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.6956877708435059, + "num_tokens": 100117372.0, + "step": 3939 + }, + { + "epoch": 0.43268174829782563, + "grad_norm": 2.1205804347991943, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.721833348274231, + "num_tokens": 100143060.0, + "step": 3940 + }, + { + "epoch": 0.4327915660004393, + "grad_norm": 2.486961603164673, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.6961763501167297, + "num_tokens": 100164895.0, + "step": 3941 + }, + { + "epoch": 0.4329013837030529, + "grad_norm": 2.5191969871520996, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7075669765472412, + "num_tokens": 100183982.0, + "step": 3942 + }, + { + "epoch": 0.43301120140566657, + "grad_norm": 2.3372340202331543, + "learning_rate": 1e-06, + "loss": 1.0234, + 
"mean_token_accuracy": 0.6962488293647766, + "num_tokens": 100207463.0, + "step": 3943 + }, + { + "epoch": 0.43312101910828027, + "grad_norm": 2.3168110847473145, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7086520195007324, + "num_tokens": 100230478.0, + "step": 3944 + }, + { + "epoch": 0.4332308368108939, + "grad_norm": 2.504354953765869, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7031484842300415, + "num_tokens": 100250729.0, + "step": 3945 + }, + { + "epoch": 0.43334065451350756, + "grad_norm": 2.5868401527404785, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7178084850311279, + "num_tokens": 100269614.0, + "step": 3946 + }, + { + "epoch": 0.43345047221612126, + "grad_norm": 2.191542625427246, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.6991509199142456, + "num_tokens": 100295269.0, + "step": 3947 + }, + { + "epoch": 0.4335602899187349, + "grad_norm": 2.2177281379699707, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7146055698394775, + "num_tokens": 100321116.0, + "step": 3948 + }, + { + "epoch": 0.43367010762134856, + "grad_norm": 2.4342384338378906, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7184110879898071, + "num_tokens": 100342934.0, + "step": 3949 + }, + { + "epoch": 0.4337799253239622, + "grad_norm": 2.323164701461792, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.6912962794303894, + "num_tokens": 100366768.0, + "step": 3950 + }, + { + "epoch": 0.4338897430265759, + "grad_norm": 2.364499807357788, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7204347252845764, + "num_tokens": 100388242.0, + "step": 3951 + }, + { + "epoch": 0.43399956072918955, + "grad_norm": 2.0764737129211426, + "learning_rate": 1e-06, + "loss": 1.1226, + "mean_token_accuracy": 0.6639370918273926, + "num_tokens": 100420239.0, + "step": 3952 + }, + { + "epoch": 0.4341093784318032, 
+ "grad_norm": 2.7412915229797363, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7166509032249451, + "num_tokens": 100438185.0, + "step": 3953 + }, + { + "epoch": 0.43421919613441684, + "grad_norm": 2.1985366344451904, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.6931435465812683, + "num_tokens": 100464455.0, + "step": 3954 + }, + { + "epoch": 0.43432901383703054, + "grad_norm": 2.13725209236145, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7231652736663818, + "num_tokens": 100488828.0, + "step": 3955 + }, + { + "epoch": 0.4344388315396442, + "grad_norm": 2.549976348876953, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7143036127090454, + "num_tokens": 100507892.0, + "step": 3956 + }, + { + "epoch": 0.43454864924225783, + "grad_norm": 2.203298807144165, + "learning_rate": 1e-06, + "loss": 1.0508, + "mean_token_accuracy": 0.6843580007553101, + "num_tokens": 100536118.0, + "step": 3957 + }, + { + "epoch": 0.43465846694487154, + "grad_norm": 2.3511850833892822, + "learning_rate": 1e-06, + "loss": 1.0506, + "mean_token_accuracy": 0.6862707138061523, + "num_tokens": 100559390.0, + "step": 3958 + }, + { + "epoch": 0.4347682846474852, + "grad_norm": 2.056180715560913, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7086136937141418, + "num_tokens": 100587813.0, + "step": 3959 + }, + { + "epoch": 0.4348781023500988, + "grad_norm": 2.217390298843384, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7046822309494019, + "num_tokens": 100614461.0, + "step": 3960 + }, + { + "epoch": 0.4349879200527125, + "grad_norm": 2.245826482772827, + "learning_rate": 1e-06, + "loss": 1.027, + "mean_token_accuracy": 0.6844769716262817, + "num_tokens": 100638806.0, + "step": 3961 + }, + { + "epoch": 0.4350977377553262, + "grad_norm": 2.5797624588012695, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.704146146774292, + 
"num_tokens": 100658706.0, + "step": 3962 + }, + { + "epoch": 0.4352075554579398, + "grad_norm": 2.5206234455108643, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7338747382164001, + "num_tokens": 100677974.0, + "step": 3963 + }, + { + "epoch": 0.43531737316055347, + "grad_norm": 2.195436954498291, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.6957892179489136, + "num_tokens": 100703371.0, + "step": 3964 + }, + { + "epoch": 0.43542719086316717, + "grad_norm": 2.2061824798583984, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7021721005439758, + "num_tokens": 100729188.0, + "step": 3965 + }, + { + "epoch": 0.4355370085657808, + "grad_norm": 2.207763910293579, + "learning_rate": 1e-06, + "loss": 1.0384, + "mean_token_accuracy": 0.6867674589157104, + "num_tokens": 100756324.0, + "step": 3966 + }, + { + "epoch": 0.43564682626839446, + "grad_norm": 2.0972061157226562, + "learning_rate": 1e-06, + "loss": 1.077, + "mean_token_accuracy": 0.6770813465118408, + "num_tokens": 100785900.0, + "step": 3967 + }, + { + "epoch": 0.4357566439710081, + "grad_norm": 2.1433868408203125, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7150998115539551, + "num_tokens": 100812692.0, + "step": 3968 + }, + { + "epoch": 0.4358664616736218, + "grad_norm": 2.150709629058838, + "learning_rate": 1e-06, + "loss": 1.0305, + "mean_token_accuracy": 0.6903296709060669, + "num_tokens": 100840520.0, + "step": 3969 + }, + { + "epoch": 0.43597627937623545, + "grad_norm": 2.309083938598633, + "learning_rate": 1e-06, + "loss": 1.0398, + "mean_token_accuracy": 0.6886179447174072, + "num_tokens": 100866168.0, + "step": 3970 + }, + { + "epoch": 0.4360860970788491, + "grad_norm": 2.5880892276763916, + "learning_rate": 1e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7280347347259521, + "num_tokens": 100884821.0, + "step": 3971 + }, + { + "epoch": 0.43619591478146275, + "grad_norm": 2.1263675689697266, + 
"learning_rate": 1e-06, + "loss": 0.937, + "mean_token_accuracy": 0.7090115547180176, + "num_tokens": 100910503.0, + "step": 3972 + }, + { + "epoch": 0.43630573248407645, + "grad_norm": 1.9507763385772705, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.7010855078697205, + "num_tokens": 100941846.0, + "step": 3973 + }, + { + "epoch": 0.4364155501866901, + "grad_norm": 2.169219493865967, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7072417736053467, + "num_tokens": 100967063.0, + "step": 3974 + }, + { + "epoch": 0.43652536788930374, + "grad_norm": 2.1866374015808105, + "learning_rate": 1e-06, + "loss": 1.134, + "mean_token_accuracy": 0.6706458330154419, + "num_tokens": 100992249.0, + "step": 3975 + }, + { + "epoch": 0.43663518559191744, + "grad_norm": 2.3896331787109375, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7144614458084106, + "num_tokens": 101014287.0, + "step": 3976 + }, + { + "epoch": 0.4367450032945311, + "grad_norm": 2.4771270751953125, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7212384939193726, + "num_tokens": 101036284.0, + "step": 3977 + }, + { + "epoch": 0.43685482099714473, + "grad_norm": 2.1368041038513184, + "learning_rate": 1e-06, + "loss": 1.0367, + "mean_token_accuracy": 0.6866413354873657, + "num_tokens": 101065779.0, + "step": 3978 + }, + { + "epoch": 0.4369646386997584, + "grad_norm": 2.461404323577881, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7167147397994995, + "num_tokens": 101085442.0, + "step": 3979 + }, + { + "epoch": 0.4370744564023721, + "grad_norm": 2.3138277530670166, + "learning_rate": 1e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.7287107706069946, + "num_tokens": 101106854.0, + "step": 3980 + }, + { + "epoch": 0.4371842741049857, + "grad_norm": 2.18523907661438, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7072195410728455, + "num_tokens": 101131834.0, + "step": 
3981 + }, + { + "epoch": 0.43729409180759937, + "grad_norm": 2.2223751544952393, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7118380069732666, + "num_tokens": 101157921.0, + "step": 3982 + }, + { + "epoch": 0.4374039095102131, + "grad_norm": 2.273996591567993, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7018226981163025, + "num_tokens": 101181667.0, + "step": 3983 + }, + { + "epoch": 0.4375137272128267, + "grad_norm": 2.1209867000579834, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7070000171661377, + "num_tokens": 101207259.0, + "step": 3984 + }, + { + "epoch": 0.43762354491544037, + "grad_norm": 2.478329658508301, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7039496898651123, + "num_tokens": 101229527.0, + "step": 3985 + }, + { + "epoch": 0.437733362618054, + "grad_norm": 2.1813340187072754, + "learning_rate": 1e-06, + "loss": 1.0431, + "mean_token_accuracy": 0.6947191953659058, + "num_tokens": 101255274.0, + "step": 3986 + }, + { + "epoch": 0.4378431803206677, + "grad_norm": 2.3544671535491943, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.6884822249412537, + "num_tokens": 101279396.0, + "step": 3987 + }, + { + "epoch": 0.43795299802328136, + "grad_norm": 2.4844253063201904, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.6979959011077881, + "num_tokens": 101301132.0, + "step": 3988 + }, + { + "epoch": 0.438062815725895, + "grad_norm": 2.194286823272705, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7277951836585999, + "num_tokens": 101326584.0, + "step": 3989 + }, + { + "epoch": 0.43817263342850865, + "grad_norm": 2.23557448387146, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.6884722113609314, + "num_tokens": 101351352.0, + "step": 3990 + }, + { + "epoch": 0.43828245113112235, + "grad_norm": 2.3495535850524902, + "learning_rate": 1e-06, + "loss": 0.9069, + 
"mean_token_accuracy": 0.7163057327270508, + "num_tokens": 101374377.0, + "step": 3991 + }, + { + "epoch": 0.438392268833736, + "grad_norm": 2.1585190296173096, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7073808908462524, + "num_tokens": 101401286.0, + "step": 3992 + }, + { + "epoch": 0.43850208653634964, + "grad_norm": 2.2409956455230713, + "learning_rate": 1e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.6935980319976807, + "num_tokens": 101427049.0, + "step": 3993 + }, + { + "epoch": 0.43861190423896335, + "grad_norm": 1.8233195543289185, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7068516612052917, + "num_tokens": 101462516.0, + "step": 3994 + }, + { + "epoch": 0.438721721941577, + "grad_norm": 2.5513617992401123, + "learning_rate": 1e-06, + "loss": 1.0542, + "mean_token_accuracy": 0.7044520378112793, + "num_tokens": 101483331.0, + "step": 3995 + }, + { + "epoch": 0.43883153964419064, + "grad_norm": 2.3967936038970947, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7146746516227722, + "num_tokens": 101504662.0, + "step": 3996 + }, + { + "epoch": 0.4389413573468043, + "grad_norm": 2.509214401245117, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7072800993919373, + "num_tokens": 101524504.0, + "step": 3997 + }, + { + "epoch": 0.439051175049418, + "grad_norm": 2.398426055908203, + "learning_rate": 1e-06, + "loss": 1.0476, + "mean_token_accuracy": 0.6828947067260742, + "num_tokens": 101548494.0, + "step": 3998 + }, + { + "epoch": 0.43916099275203163, + "grad_norm": 2.4452598094940186, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.7066872119903564, + "num_tokens": 101569817.0, + "step": 3999 + }, + { + "epoch": 0.4392708104546453, + "grad_norm": 2.1906497478485107, + "learning_rate": 1e-06, + "loss": 1.1353, + "mean_token_accuracy": 0.6718039512634277, + "num_tokens": 101598599.0, + "step": 4000 + }, + { + "epoch": 0.4393806281572589, + 
"grad_norm": 2.233276128768921, + "learning_rate": 1e-06, + "loss": 1.0864, + "mean_token_accuracy": 0.6744009852409363, + "num_tokens": 101625427.0, + "step": 4001 + }, + { + "epoch": 0.4394904458598726, + "grad_norm": 2.3640685081481934, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.6957105398178101, + "num_tokens": 101648691.0, + "step": 4002 + }, + { + "epoch": 0.43960026356248627, + "grad_norm": 2.2235312461853027, + "learning_rate": 1e-06, + "loss": 1.054, + "mean_token_accuracy": 0.6886293888092041, + "num_tokens": 101677380.0, + "step": 4003 + }, + { + "epoch": 0.4397100812650999, + "grad_norm": 2.471287488937378, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.6981594562530518, + "num_tokens": 101698204.0, + "step": 4004 + }, + { + "epoch": 0.4398198989677136, + "grad_norm": 2.232470750808716, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.6923301219940186, + "num_tokens": 101723689.0, + "step": 4005 + }, + { + "epoch": 0.43992971667032726, + "grad_norm": 2.2813475131988525, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.7050421237945557, + "num_tokens": 101748775.0, + "step": 4006 + }, + { + "epoch": 0.4400395343729409, + "grad_norm": 1.9984084367752075, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7103757858276367, + "num_tokens": 101777829.0, + "step": 4007 + }, + { + "epoch": 0.44014935207555456, + "grad_norm": 2.6069700717926025, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.6986033916473389, + "num_tokens": 101798231.0, + "step": 4008 + }, + { + "epoch": 0.44025916977816826, + "grad_norm": 2.3046457767486572, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.7020620107650757, + "num_tokens": 101824441.0, + "step": 4009 + }, + { + "epoch": 0.4403689874807819, + "grad_norm": 2.0127735137939453, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.6942888498306274, + 
"num_tokens": 101855514.0, + "step": 4010 + }, + { + "epoch": 0.44047880518339555, + "grad_norm": 2.1398537158966064, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.7013692259788513, + "num_tokens": 101885001.0, + "step": 4011 + }, + { + "epoch": 0.44058862288600925, + "grad_norm": 2.1382570266723633, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.6977971792221069, + "num_tokens": 101909994.0, + "step": 4012 + }, + { + "epoch": 0.4406984405886229, + "grad_norm": 2.4477734565734863, + "learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.7062975168228149, + "num_tokens": 101930139.0, + "step": 4013 + }, + { + "epoch": 0.44080825829123654, + "grad_norm": 2.170928716659546, + "learning_rate": 1e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.6972839832305908, + "num_tokens": 101954910.0, + "step": 4014 + }, + { + "epoch": 0.4409180759938502, + "grad_norm": 2.038529634475708, + "learning_rate": 1e-06, + "loss": 1.0317, + "mean_token_accuracy": 0.6963491439819336, + "num_tokens": 101985001.0, + "step": 4015 + }, + { + "epoch": 0.4410278936964639, + "grad_norm": 2.2949819564819336, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7227633595466614, + "num_tokens": 102008374.0, + "step": 4016 + }, + { + "epoch": 0.44113771139907754, + "grad_norm": 2.4028170108795166, + "learning_rate": 1e-06, + "loss": 0.8076, + "mean_token_accuracy": 0.7469164729118347, + "num_tokens": 102029844.0, + "step": 4017 + }, + { + "epoch": 0.4412475291016912, + "grad_norm": 2.0133190155029297, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7013627290725708, + "num_tokens": 102058954.0, + "step": 4018 + }, + { + "epoch": 0.44135734680430483, + "grad_norm": 2.2594704627990723, + "learning_rate": 1e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.6967337131500244, + "num_tokens": 102084916.0, + "step": 4019 + }, + { + "epoch": 0.44146716450691853, + "grad_norm": 2.1590654850006104, + 
"learning_rate": 1e-06, + "loss": 0.8188, + "mean_token_accuracy": 0.7468405365943909, + "num_tokens": 102110933.0, + "step": 4020 + }, + { + "epoch": 0.4415769822095322, + "grad_norm": 2.6658430099487305, + "learning_rate": 1e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.7217997312545776, + "num_tokens": 102127676.0, + "step": 4021 + }, + { + "epoch": 0.4416867999121458, + "grad_norm": 2.157822608947754, + "learning_rate": 1e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.7002219557762146, + "num_tokens": 102153228.0, + "step": 4022 + }, + { + "epoch": 0.4417966176147595, + "grad_norm": 2.100951671600342, + "learning_rate": 1e-06, + "loss": 1.0486, + "mean_token_accuracy": 0.6799278259277344, + "num_tokens": 102182344.0, + "step": 4023 + }, + { + "epoch": 0.44190643531737317, + "grad_norm": 2.636218547821045, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7098093628883362, + "num_tokens": 102199081.0, + "step": 4024 + }, + { + "epoch": 0.4420162530199868, + "grad_norm": 2.2639060020446777, + "learning_rate": 1e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.6959244012832642, + "num_tokens": 102225562.0, + "step": 4025 + }, + { + "epoch": 0.44212607072260046, + "grad_norm": 2.3354341983795166, + "learning_rate": 1e-06, + "loss": 1.0529, + "mean_token_accuracy": 0.6807447075843811, + "num_tokens": 102250620.0, + "step": 4026 + }, + { + "epoch": 0.44223588842521416, + "grad_norm": 2.6126880645751953, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7084270119667053, + "num_tokens": 102270367.0, + "step": 4027 + }, + { + "epoch": 0.4423457061278278, + "grad_norm": 2.3086166381835938, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.714361310005188, + "num_tokens": 102292473.0, + "step": 4028 + }, + { + "epoch": 0.44245552383044146, + "grad_norm": 2.3534016609191895, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7071282863616943, + "num_tokens": 102315479.0, + "step": 
4029 + }, + { + "epoch": 0.4425653415330551, + "grad_norm": 1.9684234857559204, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7158234119415283, + "num_tokens": 102347098.0, + "step": 4030 + }, + { + "epoch": 0.4426751592356688, + "grad_norm": 2.127957344055176, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6934293508529663, + "num_tokens": 102375941.0, + "step": 4031 + }, + { + "epoch": 0.44278497693828245, + "grad_norm": 2.024388074874878, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.6981291770935059, + "num_tokens": 102405276.0, + "step": 4032 + }, + { + "epoch": 0.4428947946408961, + "grad_norm": 3.0213546752929688, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7013517618179321, + "num_tokens": 102420998.0, + "step": 4033 + }, + { + "epoch": 0.4430046123435098, + "grad_norm": 2.3169143199920654, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.7057836651802063, + "num_tokens": 102444815.0, + "step": 4034 + }, + { + "epoch": 0.44311443004612344, + "grad_norm": 2.2942087650299072, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7031109929084778, + "num_tokens": 102468948.0, + "step": 4035 + }, + { + "epoch": 0.4432242477487371, + "grad_norm": 2.2444887161254883, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7092447280883789, + "num_tokens": 102493559.0, + "step": 4036 + }, + { + "epoch": 0.44333406545135073, + "grad_norm": 2.186723232269287, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.6932501792907715, + "num_tokens": 102520255.0, + "step": 4037 + }, + { + "epoch": 0.44344388315396444, + "grad_norm": 2.3439345359802246, + "learning_rate": 1e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.6957992315292358, + "num_tokens": 102545021.0, + "step": 4038 + }, + { + "epoch": 0.4435537008565781, + "grad_norm": 2.4069390296936035, + "learning_rate": 1e-06, + "loss": 1.0724, + 
"mean_token_accuracy": 0.6871538162231445, + "num_tokens": 102567095.0, + "step": 4039 + }, + { + "epoch": 0.4436635185591917, + "grad_norm": 1.771614909172058, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.70330810546875, + "num_tokens": 102602716.0, + "step": 4040 + }, + { + "epoch": 0.44377333626180543, + "grad_norm": 2.3605291843414307, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7192016839981079, + "num_tokens": 102626318.0, + "step": 4041 + }, + { + "epoch": 0.4438831539644191, + "grad_norm": 2.3518688678741455, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.7003598213195801, + "num_tokens": 102649950.0, + "step": 4042 + }, + { + "epoch": 0.4439929716670327, + "grad_norm": 2.1632397174835205, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.6923574805259705, + "num_tokens": 102676197.0, + "step": 4043 + }, + { + "epoch": 0.44410278936964637, + "grad_norm": 2.3189263343811035, + "learning_rate": 1e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7295572757720947, + "num_tokens": 102696731.0, + "step": 4044 + }, + { + "epoch": 0.44421260707226007, + "grad_norm": 2.427748203277588, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7074146270751953, + "num_tokens": 102718810.0, + "step": 4045 + }, + { + "epoch": 0.4443224247748737, + "grad_norm": 2.445526599884033, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.691835880279541, + "num_tokens": 102741393.0, + "step": 4046 + }, + { + "epoch": 0.44443224247748736, + "grad_norm": 2.2958977222442627, + "learning_rate": 1e-06, + "loss": 1.0523, + "mean_token_accuracy": 0.693877100944519, + "num_tokens": 102767433.0, + "step": 4047 + }, + { + "epoch": 0.444542060180101, + "grad_norm": 2.3077280521392822, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7041817903518677, + "num_tokens": 102789257.0, + "step": 4048 + }, + { + "epoch": 0.4446518778827147, + 
"grad_norm": 2.052557945251465, + "learning_rate": 1e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.7058224678039551, + "num_tokens": 102819008.0, + "step": 4049 + }, + { + "epoch": 0.44476169558532835, + "grad_norm": 2.2652711868286133, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7014669179916382, + "num_tokens": 102843958.0, + "step": 4050 + }, + { + "epoch": 0.444871513287942, + "grad_norm": 2.371746778488159, + "learning_rate": 1e-06, + "loss": 1.025, + "mean_token_accuracy": 0.6857458353042603, + "num_tokens": 102866403.0, + "step": 4051 + }, + { + "epoch": 0.4449813309905557, + "grad_norm": 2.254197835922241, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7034392356872559, + "num_tokens": 102890729.0, + "step": 4052 + }, + { + "epoch": 0.44509114869316935, + "grad_norm": 2.0875725746154785, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.6917345523834229, + "num_tokens": 102919190.0, + "step": 4053 + }, + { + "epoch": 0.445200966395783, + "grad_norm": 2.1768155097961426, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7200098037719727, + "num_tokens": 102946154.0, + "step": 4054 + }, + { + "epoch": 0.44531078409839664, + "grad_norm": 2.277728796005249, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7082318067550659, + "num_tokens": 102970950.0, + "step": 4055 + }, + { + "epoch": 0.44542060180101034, + "grad_norm": 2.1130874156951904, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7153822183609009, + "num_tokens": 102998503.0, + "step": 4056 + }, + { + "epoch": 0.445530419503624, + "grad_norm": 2.2495596408843994, + "learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.6883231401443481, + "num_tokens": 103022911.0, + "step": 4057 + }, + { + "epoch": 0.44564023720623763, + "grad_norm": 2.334362268447876, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7122635245323181, + 
"num_tokens": 103045794.0, + "step": 4058 + }, + { + "epoch": 0.44575005490885133, + "grad_norm": 2.493687152862549, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.6827530860900879, + "num_tokens": 103068336.0, + "step": 4059 + }, + { + "epoch": 0.445859872611465, + "grad_norm": 2.2679874897003174, + "learning_rate": 1e-06, + "loss": 1.0559, + "mean_token_accuracy": 0.6904813051223755, + "num_tokens": 103094769.0, + "step": 4060 + }, + { + "epoch": 0.4459696903140786, + "grad_norm": 2.2996668815612793, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7011697888374329, + "num_tokens": 103119177.0, + "step": 4061 + }, + { + "epoch": 0.44607950801669227, + "grad_norm": 2.222111701965332, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7090226411819458, + "num_tokens": 103145163.0, + "step": 4062 + }, + { + "epoch": 0.446189325719306, + "grad_norm": 2.408083915710449, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7149665355682373, + "num_tokens": 103167029.0, + "step": 4063 + }, + { + "epoch": 0.4462991434219196, + "grad_norm": 2.382720470428467, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.6941905617713928, + "num_tokens": 103191284.0, + "step": 4064 + }, + { + "epoch": 0.44640896112453327, + "grad_norm": 2.1413443088531494, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7053043842315674, + "num_tokens": 103218850.0, + "step": 4065 + }, + { + "epoch": 0.4465187788271469, + "grad_norm": 2.0722544193267822, + "learning_rate": 1e-06, + "loss": 1.0447, + "mean_token_accuracy": 0.6822326183319092, + "num_tokens": 103247045.0, + "step": 4066 + }, + { + "epoch": 0.4466285965297606, + "grad_norm": 2.0705649852752686, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.6839836835861206, + "num_tokens": 103275768.0, + "step": 4067 + }, + { + "epoch": 0.44673841423237426, + "grad_norm": 2.393871307373047, + 
"learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7058879733085632, + "num_tokens": 103298344.0, + "step": 4068 + }, + { + "epoch": 0.4468482319349879, + "grad_norm": 2.8453903198242188, + "learning_rate": 1e-06, + "loss": 0.8401, + "mean_token_accuracy": 0.7397297620773315, + "num_tokens": 103315974.0, + "step": 4069 + }, + { + "epoch": 0.4469580496376016, + "grad_norm": 2.1759068965911865, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7028789520263672, + "num_tokens": 103344869.0, + "step": 4070 + }, + { + "epoch": 0.44706786734021525, + "grad_norm": 2.368042230606079, + "learning_rate": 1e-06, + "loss": 1.0721, + "mean_token_accuracy": 0.6801438331604004, + "num_tokens": 103369732.0, + "step": 4071 + }, + { + "epoch": 0.4471776850428289, + "grad_norm": 2.2757418155670166, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7037255764007568, + "num_tokens": 103393294.0, + "step": 4072 + }, + { + "epoch": 0.44728750274544254, + "grad_norm": 2.289870500564575, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7093410491943359, + "num_tokens": 103417296.0, + "step": 4073 + }, + { + "epoch": 0.44739732044805625, + "grad_norm": 2.4405617713928223, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.7094510793685913, + "num_tokens": 103440312.0, + "step": 4074 + }, + { + "epoch": 0.4475071381506699, + "grad_norm": 2.4113574028015137, + "learning_rate": 1e-06, + "loss": 1.0773, + "mean_token_accuracy": 0.6773221492767334, + "num_tokens": 103465234.0, + "step": 4075 + }, + { + "epoch": 0.44761695585328354, + "grad_norm": 2.4141182899475098, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7166049480438232, + "num_tokens": 103488534.0, + "step": 4076 + }, + { + "epoch": 0.4477267735558972, + "grad_norm": 2.5518274307250977, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7174221873283386, + "num_tokens": 103509318.0, + "step": 
4077 + }, + { + "epoch": 0.4478365912585109, + "grad_norm": 2.2873587608337402, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7054398059844971, + "num_tokens": 103534502.0, + "step": 4078 + }, + { + "epoch": 0.44794640896112453, + "grad_norm": 2.1348395347595215, + "learning_rate": 1e-06, + "loss": 1.0833, + "mean_token_accuracy": 0.6851780414581299, + "num_tokens": 103565075.0, + "step": 4079 + }, + { + "epoch": 0.4480562266637382, + "grad_norm": 2.352975368499756, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7024519443511963, + "num_tokens": 103589632.0, + "step": 4080 + }, + { + "epoch": 0.4481660443663519, + "grad_norm": 2.4587438106536865, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.711434006690979, + "num_tokens": 103610262.0, + "step": 4081 + }, + { + "epoch": 0.4482758620689655, + "grad_norm": 2.271266222000122, + "learning_rate": 1e-06, + "loss": 1.0356, + "mean_token_accuracy": 0.6906313896179199, + "num_tokens": 103634733.0, + "step": 4082 + }, + { + "epoch": 0.44838567977157917, + "grad_norm": 2.4497528076171875, + "learning_rate": 1e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7280747890472412, + "num_tokens": 103655260.0, + "step": 4083 + }, + { + "epoch": 0.4484954974741928, + "grad_norm": 2.2025437355041504, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7083929777145386, + "num_tokens": 103680530.0, + "step": 4084 + }, + { + "epoch": 0.4486053151768065, + "grad_norm": 2.2313711643218994, + "learning_rate": 1e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.6947002410888672, + "num_tokens": 103705571.0, + "step": 4085 + }, + { + "epoch": 0.44871513287942016, + "grad_norm": 2.1922032833099365, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.6988019943237305, + "num_tokens": 103731847.0, + "step": 4086 + }, + { + "epoch": 0.4488249505820338, + "grad_norm": 2.2713100910186768, + "learning_rate": 1e-06, + "loss": 0.9264, + 
"mean_token_accuracy": 0.7158571481704712, + "num_tokens": 103756750.0, + "step": 4087 + }, + { + "epoch": 0.4489347682846475, + "grad_norm": 2.2359063625335693, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7039921283721924, + "num_tokens": 103779626.0, + "step": 4088 + }, + { + "epoch": 0.44904458598726116, + "grad_norm": 2.48947811126709, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.72102952003479, + "num_tokens": 103799688.0, + "step": 4089 + }, + { + "epoch": 0.4491544036898748, + "grad_norm": 2.511880397796631, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.7000168561935425, + "num_tokens": 103821191.0, + "step": 4090 + }, + { + "epoch": 0.44926422139248845, + "grad_norm": 2.14570951461792, + "learning_rate": 1e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.683417797088623, + "num_tokens": 103849802.0, + "step": 4091 + }, + { + "epoch": 0.44937403909510215, + "grad_norm": 1.8125834465026855, + "learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.6990577578544617, + "num_tokens": 103887317.0, + "step": 4092 + }, + { + "epoch": 0.4494838567977158, + "grad_norm": 2.194302797317505, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7166633009910583, + "num_tokens": 103915016.0, + "step": 4093 + }, + { + "epoch": 0.44959367450032944, + "grad_norm": 2.2065277099609375, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7121998071670532, + "num_tokens": 103940027.0, + "step": 4094 + }, + { + "epoch": 0.4497034922029431, + "grad_norm": 2.0933427810668945, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7115143537521362, + "num_tokens": 103968201.0, + "step": 4095 + }, + { + "epoch": 0.4498133099055568, + "grad_norm": 2.178495168685913, + "learning_rate": 1e-06, + "loss": 0.8499, + "mean_token_accuracy": 0.733013927936554, + "num_tokens": 103992693.0, + "step": 4096 + }, + { + "epoch": 0.44992312760817044, + 
"grad_norm": 2.2625021934509277, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7073365449905396, + "num_tokens": 104018741.0, + "step": 4097 + }, + { + "epoch": 0.4500329453107841, + "grad_norm": 2.1557250022888184, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7206329703330994, + "num_tokens": 104042456.0, + "step": 4098 + }, + { + "epoch": 0.4501427630133978, + "grad_norm": 1.9795958995819092, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.6944370269775391, + "num_tokens": 104071979.0, + "step": 4099 + }, + { + "epoch": 0.45025258071601143, + "grad_norm": 2.2807981967926025, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7067753076553345, + "num_tokens": 104098837.0, + "step": 4100 + }, + { + "epoch": 0.4503623984186251, + "grad_norm": 2.1388025283813477, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7108763456344604, + "num_tokens": 104127181.0, + "step": 4101 + }, + { + "epoch": 0.4504722161212387, + "grad_norm": 1.9625002145767212, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.724113941192627, + "num_tokens": 104154453.0, + "step": 4102 + }, + { + "epoch": 0.4505820338238524, + "grad_norm": 2.087547540664673, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.6931132674217224, + "num_tokens": 104182768.0, + "step": 4103 + }, + { + "epoch": 0.45069185152646607, + "grad_norm": 2.1294565200805664, + "learning_rate": 1e-06, + "loss": 1.0282, + "mean_token_accuracy": 0.689848780632019, + "num_tokens": 104210356.0, + "step": 4104 + }, + { + "epoch": 0.4508016692290797, + "grad_norm": 2.3909506797790527, + "learning_rate": 1e-06, + "loss": 1.0435, + "mean_token_accuracy": 0.6878582835197449, + "num_tokens": 104233184.0, + "step": 4105 + }, + { + "epoch": 0.45091148693169336, + "grad_norm": 2.3499839305877686, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7144712805747986, + 
"num_tokens": 104254245.0, + "step": 4106 + }, + { + "epoch": 0.45102130463430706, + "grad_norm": 2.048301935195923, + "learning_rate": 1e-06, + "loss": 1.0483, + "mean_token_accuracy": 0.698505163192749, + "num_tokens": 104282552.0, + "step": 4107 + }, + { + "epoch": 0.4511311223369207, + "grad_norm": 2.398853063583374, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7215416431427002, + "num_tokens": 104305686.0, + "step": 4108 + }, + { + "epoch": 0.45124094003953436, + "grad_norm": 2.2706243991851807, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7327682375907898, + "num_tokens": 104330328.0, + "step": 4109 + }, + { + "epoch": 0.45135075774214806, + "grad_norm": 2.342071771621704, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7054821848869324, + "num_tokens": 104352982.0, + "step": 4110 + }, + { + "epoch": 0.4514605754447617, + "grad_norm": 2.2450013160705566, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.6926158666610718, + "num_tokens": 104379452.0, + "step": 4111 + }, + { + "epoch": 0.45157039314737535, + "grad_norm": 2.067833662033081, + "learning_rate": 1e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.7013750076293945, + "num_tokens": 104408178.0, + "step": 4112 + }, + { + "epoch": 0.451680210849989, + "grad_norm": 2.4752562046051025, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.6978076696395874, + "num_tokens": 104430015.0, + "step": 4113 + }, + { + "epoch": 0.4517900285526027, + "grad_norm": 1.9837708473205566, + "learning_rate": 1e-06, + "loss": 1.0344, + "mean_token_accuracy": 0.6867313981056213, + "num_tokens": 104462883.0, + "step": 4114 + }, + { + "epoch": 0.45189984625521634, + "grad_norm": 1.9462817907333374, + "learning_rate": 1e-06, + "loss": 1.0841, + "mean_token_accuracy": 0.6761660575866699, + "num_tokens": 104498850.0, + "step": 4115 + }, + { + "epoch": 0.45200966395783, + "grad_norm": 2.3353068828582764, + 
"learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.6980177164077759, + "num_tokens": 104522119.0, + "step": 4116 + }, + { + "epoch": 0.4521194816604437, + "grad_norm": 2.1264593601226807, + "learning_rate": 1e-06, + "loss": 1.015, + "mean_token_accuracy": 0.696226954460144, + "num_tokens": 104550017.0, + "step": 4117 + }, + { + "epoch": 0.45222929936305734, + "grad_norm": 2.230945348739624, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7074017524719238, + "num_tokens": 104573900.0, + "step": 4118 + }, + { + "epoch": 0.452339117065671, + "grad_norm": 2.068652391433716, + "learning_rate": 1e-06, + "loss": 1.0472, + "mean_token_accuracy": 0.6827725172042847, + "num_tokens": 104604663.0, + "step": 4119 + }, + { + "epoch": 0.4524489347682846, + "grad_norm": 2.0579984188079834, + "learning_rate": 1e-06, + "loss": 1.0279, + "mean_token_accuracy": 0.6901822090148926, + "num_tokens": 104633180.0, + "step": 4120 + }, + { + "epoch": 0.45255875247089833, + "grad_norm": 2.14316987991333, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7283152937889099, + "num_tokens": 104657039.0, + "step": 4121 + }, + { + "epoch": 0.452668570173512, + "grad_norm": 2.06636381149292, + "learning_rate": 1e-06, + "loss": 1.1589, + "mean_token_accuracy": 0.6501599550247192, + "num_tokens": 104690100.0, + "step": 4122 + }, + { + "epoch": 0.4527783878761256, + "grad_norm": 2.071521520614624, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7077213525772095, + "num_tokens": 104717862.0, + "step": 4123 + }, + { + "epoch": 0.45288820557873927, + "grad_norm": 2.494675397872925, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7158386707305908, + "num_tokens": 104737542.0, + "step": 4124 + }, + { + "epoch": 0.45299802328135297, + "grad_norm": 2.0672833919525146, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.7051010727882385, + "num_tokens": 104765464.0, + "step": 4125 + }, 
+ { + "epoch": 0.4531078409839666, + "grad_norm": 2.3410439491271973, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7056611776351929, + "num_tokens": 104790112.0, + "step": 4126 + }, + { + "epoch": 0.45321765868658026, + "grad_norm": 2.402571439743042, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.7129920125007629, + "num_tokens": 104812663.0, + "step": 4127 + }, + { + "epoch": 0.45332747638919396, + "grad_norm": 2.257563591003418, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7057636380195618, + "num_tokens": 104837524.0, + "step": 4128 + }, + { + "epoch": 0.4534372940918076, + "grad_norm": 2.2054495811462402, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.6866426467895508, + "num_tokens": 104862988.0, + "step": 4129 + }, + { + "epoch": 0.45354711179442125, + "grad_norm": 2.3885817527770996, + "learning_rate": 1e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7177908420562744, + "num_tokens": 104885422.0, + "step": 4130 + }, + { + "epoch": 0.4536569294970349, + "grad_norm": 2.267430305480957, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6962426900863647, + "num_tokens": 104908913.0, + "step": 4131 + }, + { + "epoch": 0.4537667471996486, + "grad_norm": 2.2116057872772217, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7032036185264587, + "num_tokens": 104935378.0, + "step": 4132 + }, + { + "epoch": 0.45387656490226225, + "grad_norm": 2.5524818897247314, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7012125849723816, + "num_tokens": 104955378.0, + "step": 4133 + }, + { + "epoch": 0.4539863826048759, + "grad_norm": 2.111234188079834, + "learning_rate": 1e-06, + "loss": 1.036, + "mean_token_accuracy": 0.6861701011657715, + "num_tokens": 104983414.0, + "step": 4134 + }, + { + "epoch": 0.4540962003074896, + "grad_norm": 2.2662477493286133, + "learning_rate": 1e-06, + "loss": 0.9808, + 
"mean_token_accuracy": 0.703216552734375, + "num_tokens": 105009201.0, + "step": 4135 + }, + { + "epoch": 0.45420601801010324, + "grad_norm": 2.3503763675689697, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7096494436264038, + "num_tokens": 105032553.0, + "step": 4136 + }, + { + "epoch": 0.4543158357127169, + "grad_norm": 2.284085988998413, + "learning_rate": 1e-06, + "loss": 1.093, + "mean_token_accuracy": 0.6801068782806396, + "num_tokens": 105057858.0, + "step": 4137 + }, + { + "epoch": 0.45442565341533053, + "grad_norm": 2.456454277038574, + "learning_rate": 1e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7243863344192505, + "num_tokens": 105077023.0, + "step": 4138 + }, + { + "epoch": 0.45453547111794423, + "grad_norm": 2.4953677654266357, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7118386030197144, + "num_tokens": 105098351.0, + "step": 4139 + }, + { + "epoch": 0.4546452888205579, + "grad_norm": 2.1961193084716797, + "learning_rate": 1e-06, + "loss": 1.022, + "mean_token_accuracy": 0.6917747855186462, + "num_tokens": 105125059.0, + "step": 4140 + }, + { + "epoch": 0.4547551065231715, + "grad_norm": 2.0072073936462402, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.6933034062385559, + "num_tokens": 105154481.0, + "step": 4141 + }, + { + "epoch": 0.45486492422578517, + "grad_norm": 2.2215898036956787, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.6961566209793091, + "num_tokens": 105177908.0, + "step": 4142 + }, + { + "epoch": 0.4549747419283989, + "grad_norm": 2.265256881713867, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7261512279510498, + "num_tokens": 105200702.0, + "step": 4143 + }, + { + "epoch": 0.4550845596310125, + "grad_norm": 2.454488754272461, + "learning_rate": 1e-06, + "loss": 1.0448, + "mean_token_accuracy": 0.6885197758674622, + "num_tokens": 105223500.0, + "step": 4144 + }, + { + "epoch": 0.45519437733362617, + 
"grad_norm": 2.1537787914276123, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7061126232147217, + "num_tokens": 105249109.0, + "step": 4145 + }, + { + "epoch": 0.45530419503623987, + "grad_norm": 2.484055280685425, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7155319452285767, + "num_tokens": 105269168.0, + "step": 4146 + }, + { + "epoch": 0.4554140127388535, + "grad_norm": 2.0368731021881104, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6927727460861206, + "num_tokens": 105300125.0, + "step": 4147 + }, + { + "epoch": 0.45552383044146716, + "grad_norm": 2.042351484298706, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.7016197443008423, + "num_tokens": 105329462.0, + "step": 4148 + }, + { + "epoch": 0.4556336481440808, + "grad_norm": 2.5128893852233887, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.712024986743927, + "num_tokens": 105348604.0, + "step": 4149 + }, + { + "epoch": 0.4557434658466945, + "grad_norm": 2.1258606910705566, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.701671838760376, + "num_tokens": 105376291.0, + "step": 4150 + }, + { + "epoch": 0.45585328354930815, + "grad_norm": 2.4098520278930664, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7159063816070557, + "num_tokens": 105397557.0, + "step": 4151 + }, + { + "epoch": 0.4559631012519218, + "grad_norm": 2.2226173877716064, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7031142711639404, + "num_tokens": 105422688.0, + "step": 4152 + }, + { + "epoch": 0.45607291895453544, + "grad_norm": 2.320394277572632, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.724426805973053, + "num_tokens": 105444876.0, + "step": 4153 + }, + { + "epoch": 0.45618273665714915, + "grad_norm": 2.2843661308288574, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7105104327201843, + 
"num_tokens": 105469034.0, + "step": 4154 + }, + { + "epoch": 0.4562925543597628, + "grad_norm": 2.04790997505188, + "learning_rate": 1e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7155634760856628, + "num_tokens": 105497326.0, + "step": 4155 + }, + { + "epoch": 0.45640237206237644, + "grad_norm": 2.049461841583252, + "learning_rate": 1e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7181427478790283, + "num_tokens": 105526829.0, + "step": 4156 + }, + { + "epoch": 0.45651218976499014, + "grad_norm": 2.2461016178131104, + "learning_rate": 1e-06, + "loss": 0.855, + "mean_token_accuracy": 0.7350341081619263, + "num_tokens": 105549569.0, + "step": 4157 + }, + { + "epoch": 0.4566220074676038, + "grad_norm": 2.1344242095947266, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7201443314552307, + "num_tokens": 105575460.0, + "step": 4158 + }, + { + "epoch": 0.45673182517021743, + "grad_norm": 2.4395055770874023, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7122596502304077, + "num_tokens": 105594669.0, + "step": 4159 + }, + { + "epoch": 0.4568416428728311, + "grad_norm": 2.131707191467285, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7073724865913391, + "num_tokens": 105620853.0, + "step": 4160 + }, + { + "epoch": 0.4569514605754448, + "grad_norm": 1.7375706434249878, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7200666666030884, + "num_tokens": 105655563.0, + "step": 4161 + }, + { + "epoch": 0.4570612782780584, + "grad_norm": 2.0865681171417236, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.698788046836853, + "num_tokens": 105681748.0, + "step": 4162 + }, + { + "epoch": 0.45717109598067207, + "grad_norm": 2.04286789894104, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7138830423355103, + "num_tokens": 105708782.0, + "step": 4163 + }, + { + "epoch": 0.4572809136832858, + "grad_norm": 2.417308807373047, + 
"learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7275981903076172, + "num_tokens": 105730927.0, + "step": 4164 + }, + { + "epoch": 0.4573907313858994, + "grad_norm": 2.489880323410034, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7072774767875671, + "num_tokens": 105751711.0, + "step": 4165 + }, + { + "epoch": 0.45750054908851306, + "grad_norm": 2.0216667652130127, + "learning_rate": 1e-06, + "loss": 1.0832, + "mean_token_accuracy": 0.6798146367073059, + "num_tokens": 105782236.0, + "step": 4166 + }, + { + "epoch": 0.4576103667911267, + "grad_norm": 2.1632156372070312, + "learning_rate": 1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.7060730457305908, + "num_tokens": 105807828.0, + "step": 4167 + }, + { + "epoch": 0.4577201844937404, + "grad_norm": 2.058269739151001, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7004320025444031, + "num_tokens": 105837018.0, + "step": 4168 + }, + { + "epoch": 0.45783000219635406, + "grad_norm": 2.2826569080352783, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.699550986289978, + "num_tokens": 105861447.0, + "step": 4169 + }, + { + "epoch": 0.4579398198989677, + "grad_norm": 2.4690489768981934, + "learning_rate": 1e-06, + "loss": 0.81, + "mean_token_accuracy": 0.7413848638534546, + "num_tokens": 105881388.0, + "step": 4170 + }, + { + "epoch": 0.45804963760158135, + "grad_norm": 2.4696905612945557, + "learning_rate": 1e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7232950925827026, + "num_tokens": 105901063.0, + "step": 4171 + }, + { + "epoch": 0.45815945530419505, + "grad_norm": 2.3692922592163086, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7178217172622681, + "num_tokens": 105922082.0, + "step": 4172 + }, + { + "epoch": 0.4582692730068087, + "grad_norm": 2.2820606231689453, + "learning_rate": 1e-06, + "loss": 1.0355, + "mean_token_accuracy": 0.6863856315612793, + "num_tokens": 105946544.0, + "step": 4173 
+ }, + { + "epoch": 0.45837909070942234, + "grad_norm": 2.023527145385742, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7043692469596863, + "num_tokens": 105975485.0, + "step": 4174 + }, + { + "epoch": 0.45848890841203604, + "grad_norm": 2.1603901386260986, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.7009044289588928, + "num_tokens": 106002526.0, + "step": 4175 + }, + { + "epoch": 0.4585987261146497, + "grad_norm": 2.2780745029449463, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.6987616419792175, + "num_tokens": 106029218.0, + "step": 4176 + }, + { + "epoch": 0.45870854381726334, + "grad_norm": 2.3747105598449707, + "learning_rate": 1e-06, + "loss": 1.0485, + "mean_token_accuracy": 0.6941677331924438, + "num_tokens": 106051927.0, + "step": 4177 + }, + { + "epoch": 0.458818361519877, + "grad_norm": 2.2330033779144287, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.7103716135025024, + "num_tokens": 106075577.0, + "step": 4178 + }, + { + "epoch": 0.4589281792224907, + "grad_norm": 2.172891855239868, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7028495669364929, + "num_tokens": 106102895.0, + "step": 4179 + }, + { + "epoch": 0.45903799692510433, + "grad_norm": 2.270979881286621, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7253510355949402, + "num_tokens": 106125006.0, + "step": 4180 + }, + { + "epoch": 0.459147814627718, + "grad_norm": 2.3936004638671875, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7094821929931641, + "num_tokens": 106147315.0, + "step": 4181 + }, + { + "epoch": 0.4592576323303316, + "grad_norm": 2.0671634674072266, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6934406161308289, + "num_tokens": 106175591.0, + "step": 4182 + }, + { + "epoch": 0.4593674500329453, + "grad_norm": 2.3241498470306396, + "learning_rate": 1e-06, + "loss": 0.9912, + 
"mean_token_accuracy": 0.6992496252059937, + "num_tokens": 106199298.0, + "step": 4183 + }, + { + "epoch": 0.45947726773555897, + "grad_norm": 2.299760103225708, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7018420696258545, + "num_tokens": 106224186.0, + "step": 4184 + }, + { + "epoch": 0.4595870854381726, + "grad_norm": 2.2205140590667725, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7147631645202637, + "num_tokens": 106250822.0, + "step": 4185 + }, + { + "epoch": 0.4596969031407863, + "grad_norm": 2.375922441482544, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7022584080696106, + "num_tokens": 106274403.0, + "step": 4186 + }, + { + "epoch": 0.45980672084339996, + "grad_norm": 2.188096761703491, + "learning_rate": 1e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.6868269443511963, + "num_tokens": 106300407.0, + "step": 4187 + }, + { + "epoch": 0.4599165385460136, + "grad_norm": 2.1181373596191406, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7059856653213501, + "num_tokens": 106330909.0, + "step": 4188 + }, + { + "epoch": 0.46002635624862726, + "grad_norm": 2.4860892295837402, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7066841125488281, + "num_tokens": 106353049.0, + "step": 4189 + }, + { + "epoch": 0.46013617395124096, + "grad_norm": 2.3380396366119385, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.7016117572784424, + "num_tokens": 106377475.0, + "step": 4190 + }, + { + "epoch": 0.4602459916538546, + "grad_norm": 2.099719762802124, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.6984045505523682, + "num_tokens": 106405946.0, + "step": 4191 + }, + { + "epoch": 0.46035580935646825, + "grad_norm": 2.2230889797210693, + "learning_rate": 1e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.6921844482421875, + "num_tokens": 106433714.0, + "step": 4192 + }, + { + "epoch": 0.46046562705908195, 
+ "grad_norm": 2.363966941833496, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7145536541938782, + "num_tokens": 106457103.0, + "step": 4193 + }, + { + "epoch": 0.4605754447616956, + "grad_norm": 2.2825136184692383, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7350254058837891, + "num_tokens": 106480062.0, + "step": 4194 + }, + { + "epoch": 0.46068526246430924, + "grad_norm": 1.950549840927124, + "learning_rate": 1e-06, + "loss": 1.0473, + "mean_token_accuracy": 0.6780222654342651, + "num_tokens": 106511360.0, + "step": 4195 + }, + { + "epoch": 0.4607950801669229, + "grad_norm": 2.2685093879699707, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.6889079213142395, + "num_tokens": 106535826.0, + "step": 4196 + }, + { + "epoch": 0.4609048978695366, + "grad_norm": 2.5025229454040527, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7115668654441833, + "num_tokens": 106557659.0, + "step": 4197 + }, + { + "epoch": 0.46101471557215024, + "grad_norm": 2.2275688648223877, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7074620127677917, + "num_tokens": 106582729.0, + "step": 4198 + }, + { + "epoch": 0.4611245332747639, + "grad_norm": 2.3336703777313232, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.6995420455932617, + "num_tokens": 106606187.0, + "step": 4199 + }, + { + "epoch": 0.4612343509773775, + "grad_norm": 2.3081116676330566, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7147164344787598, + "num_tokens": 106628756.0, + "step": 4200 + }, + { + "epoch": 0.46134416867999123, + "grad_norm": 1.930449366569519, + "learning_rate": 1e-06, + "loss": 1.0414, + "mean_token_accuracy": 0.6847153306007385, + "num_tokens": 106662492.0, + "step": 4201 + }, + { + "epoch": 0.4614539863826049, + "grad_norm": 1.8145517110824585, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.6877530813217163, + 
"num_tokens": 106698519.0, + "step": 4202 + }, + { + "epoch": 0.4615638040852185, + "grad_norm": 2.353187084197998, + "learning_rate": 1e-06, + "loss": 1.0979, + "mean_token_accuracy": 0.6670486927032471, + "num_tokens": 106722004.0, + "step": 4203 + }, + { + "epoch": 0.4616736217878322, + "grad_norm": 1.8587586879730225, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7155467867851257, + "num_tokens": 106754097.0, + "step": 4204 + }, + { + "epoch": 0.46178343949044587, + "grad_norm": 2.064145088195801, + "learning_rate": 1e-06, + "loss": 1.0487, + "mean_token_accuracy": 0.6847622394561768, + "num_tokens": 106785260.0, + "step": 4205 + }, + { + "epoch": 0.4618932571930595, + "grad_norm": 2.421220541000366, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7026075720787048, + "num_tokens": 106808878.0, + "step": 4206 + }, + { + "epoch": 0.46200307489567316, + "grad_norm": 2.3953871726989746, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.6985794901847839, + "num_tokens": 106832064.0, + "step": 4207 + }, + { + "epoch": 0.46211289259828686, + "grad_norm": 2.081418514251709, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7162265777587891, + "num_tokens": 106860394.0, + "step": 4208 + }, + { + "epoch": 0.4622227103009005, + "grad_norm": 2.0563511848449707, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.7011567950248718, + "num_tokens": 106889060.0, + "step": 4209 + }, + { + "epoch": 0.46233252800351415, + "grad_norm": 2.1324312686920166, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7019810676574707, + "num_tokens": 106914224.0, + "step": 4210 + }, + { + "epoch": 0.46244234570612786, + "grad_norm": 2.3555142879486084, + "learning_rate": 1e-06, + "loss": 1.1034, + "mean_token_accuracy": 0.6743361353874207, + "num_tokens": 106937137.0, + "step": 4211 + }, + { + "epoch": 0.4625521634087415, + "grad_norm": 2.114262580871582, + 
"learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.6946396231651306, + "num_tokens": 106963496.0, + "step": 4212 + }, + { + "epoch": 0.46266198111135515, + "grad_norm": 2.2460408210754395, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7145706415176392, + "num_tokens": 106988430.0, + "step": 4213 + }, + { + "epoch": 0.4627717988139688, + "grad_norm": 2.1741604804992676, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7258621454238892, + "num_tokens": 107012923.0, + "step": 4214 + }, + { + "epoch": 0.4628816165165825, + "grad_norm": 2.3047831058502197, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7168910503387451, + "num_tokens": 107038115.0, + "step": 4215 + }, + { + "epoch": 0.46299143421919614, + "grad_norm": 2.0115582942962646, + "learning_rate": 1e-06, + "loss": 0.85, + "mean_token_accuracy": 0.7337627410888672, + "num_tokens": 107066127.0, + "step": 4216 + }, + { + "epoch": 0.4631012519218098, + "grad_norm": 2.2132701873779297, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.715243935585022, + "num_tokens": 107090297.0, + "step": 4217 + }, + { + "epoch": 0.46321106962442343, + "grad_norm": 2.41290545463562, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7228835821151733, + "num_tokens": 107113846.0, + "step": 4218 + }, + { + "epoch": 0.46332088732703713, + "grad_norm": 2.065338373184204, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.701938807964325, + "num_tokens": 107143281.0, + "step": 4219 + }, + { + "epoch": 0.4634307050296508, + "grad_norm": 1.956041693687439, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.7024095058441162, + "num_tokens": 107176470.0, + "step": 4220 + }, + { + "epoch": 0.4635405227322644, + "grad_norm": 2.3408384323120117, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7056434154510498, + "num_tokens": 107199696.0, + "step": 4221 + 
}, + { + "epoch": 0.4636503404348781, + "grad_norm": 2.3415918350219727, + "learning_rate": 1e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7162052392959595, + "num_tokens": 107224500.0, + "step": 4222 + }, + { + "epoch": 0.4637601581374918, + "grad_norm": 2.036548614501953, + "learning_rate": 1e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.695274829864502, + "num_tokens": 107254395.0, + "step": 4223 + }, + { + "epoch": 0.4638699758401054, + "grad_norm": 2.0669314861297607, + "learning_rate": 1e-06, + "loss": 1.1251, + "mean_token_accuracy": 0.6693427562713623, + "num_tokens": 107284243.0, + "step": 4224 + }, + { + "epoch": 0.46397979354271907, + "grad_norm": 2.2284507751464844, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7009062767028809, + "num_tokens": 107309890.0, + "step": 4225 + }, + { + "epoch": 0.46408961124533277, + "grad_norm": 2.12381649017334, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.7006276845932007, + "num_tokens": 107339506.0, + "step": 4226 + }, + { + "epoch": 0.4641994289479464, + "grad_norm": 2.159822940826416, + "learning_rate": 1e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.7320551872253418, + "num_tokens": 107364735.0, + "step": 4227 + }, + { + "epoch": 0.46430924665056006, + "grad_norm": 2.404785394668579, + "learning_rate": 1e-06, + "loss": 1.0514, + "mean_token_accuracy": 0.6880201101303101, + "num_tokens": 107386271.0, + "step": 4228 + }, + { + "epoch": 0.4644190643531737, + "grad_norm": 2.275015115737915, + "learning_rate": 1e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.6974337697029114, + "num_tokens": 107410378.0, + "step": 4229 + }, + { + "epoch": 0.4645288820557874, + "grad_norm": 2.006417751312256, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.6976913213729858, + "num_tokens": 107439711.0, + "step": 4230 + }, + { + "epoch": 0.46463869975840105, + "grad_norm": 2.4273664951324463, + "learning_rate": 1e-06, + "loss": 0.9113, + 
"mean_token_accuracy": 0.7210087180137634, + "num_tokens": 107460034.0, + "step": 4231 + }, + { + "epoch": 0.4647485174610147, + "grad_norm": 2.3561208248138428, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.707595705986023, + "num_tokens": 107483655.0, + "step": 4232 + }, + { + "epoch": 0.4648583351636284, + "grad_norm": 2.1058831214904785, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7205517888069153, + "num_tokens": 107510889.0, + "step": 4233 + }, + { + "epoch": 0.46496815286624205, + "grad_norm": 2.1512513160705566, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7250473499298096, + "num_tokens": 107535249.0, + "step": 4234 + }, + { + "epoch": 0.4650779705688557, + "grad_norm": 2.0732696056365967, + "learning_rate": 1e-06, + "loss": 1.1267, + "mean_token_accuracy": 0.6721270680427551, + "num_tokens": 107565678.0, + "step": 4235 + }, + { + "epoch": 0.46518778827146934, + "grad_norm": 1.982973575592041, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7242899537086487, + "num_tokens": 107594448.0, + "step": 4236 + }, + { + "epoch": 0.46529760597408304, + "grad_norm": 2.3511581420898438, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.6946845054626465, + "num_tokens": 107617996.0, + "step": 4237 + }, + { + "epoch": 0.4654074236766967, + "grad_norm": 2.101093292236328, + "learning_rate": 1e-06, + "loss": 1.0547, + "mean_token_accuracy": 0.6804594397544861, + "num_tokens": 107646135.0, + "step": 4238 + }, + { + "epoch": 0.46551724137931033, + "grad_norm": 2.140538215637207, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.7072547078132629, + "num_tokens": 107673813.0, + "step": 4239 + }, + { + "epoch": 0.46562705908192403, + "grad_norm": 2.2694976329803467, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.7032113075256348, + "num_tokens": 107699068.0, + "step": 4240 + }, + { + "epoch": 0.4657368767845377, + 
"grad_norm": 2.194387674331665, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.7038788199424744, + "num_tokens": 107724479.0, + "step": 4241 + }, + { + "epoch": 0.4658466944871513, + "grad_norm": 2.277223587036133, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7211564779281616, + "num_tokens": 107746581.0, + "step": 4242 + }, + { + "epoch": 0.46595651218976497, + "grad_norm": 2.3408491611480713, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.7017540335655212, + "num_tokens": 107769964.0, + "step": 4243 + }, + { + "epoch": 0.4660663298923787, + "grad_norm": 2.1926803588867188, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.6971865892410278, + "num_tokens": 107796636.0, + "step": 4244 + }, + { + "epoch": 0.4661761475949923, + "grad_norm": 1.9853370189666748, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7074332237243652, + "num_tokens": 107826006.0, + "step": 4245 + }, + { + "epoch": 0.46628596529760596, + "grad_norm": 2.741058111190796, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7068515419960022, + "num_tokens": 107844497.0, + "step": 4246 + }, + { + "epoch": 0.4663957830002196, + "grad_norm": 1.926510214805603, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7030117511749268, + "num_tokens": 107874573.0, + "step": 4247 + }, + { + "epoch": 0.4665056007028333, + "grad_norm": 2.5433552265167236, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7038042545318604, + "num_tokens": 107896307.0, + "step": 4248 + }, + { + "epoch": 0.46661541840544696, + "grad_norm": 2.1586666107177734, + "learning_rate": 1e-06, + "loss": 1.0269, + "mean_token_accuracy": 0.6908633708953857, + "num_tokens": 107924792.0, + "step": 4249 + }, + { + "epoch": 0.4667252361080606, + "grad_norm": 2.3791701793670654, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7061932682991028, + 
"num_tokens": 107947460.0, + "step": 4250 + }, + { + "epoch": 0.4668350538106743, + "grad_norm": 2.427591562271118, + "learning_rate": 1e-06, + "loss": 1.022, + "mean_token_accuracy": 0.6896508932113647, + "num_tokens": 107969979.0, + "step": 4251 + }, + { + "epoch": 0.46694487151328795, + "grad_norm": 2.4197707176208496, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7210556268692017, + "num_tokens": 107989840.0, + "step": 4252 + }, + { + "epoch": 0.4670546892159016, + "grad_norm": 2.110093593597412, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.7084153294563293, + "num_tokens": 108015985.0, + "step": 4253 + }, + { + "epoch": 0.46716450691851524, + "grad_norm": 2.228367328643799, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7158043384552002, + "num_tokens": 108038571.0, + "step": 4254 + }, + { + "epoch": 0.46727432462112894, + "grad_norm": 2.369978666305542, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.712295413017273, + "num_tokens": 108060148.0, + "step": 4255 + }, + { + "epoch": 0.4673841423237426, + "grad_norm": 2.288149118423462, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7226094007492065, + "num_tokens": 108084462.0, + "step": 4256 + }, + { + "epoch": 0.46749396002635624, + "grad_norm": 2.084747314453125, + "learning_rate": 1e-06, + "loss": 1.0301, + "mean_token_accuracy": 0.6852160096168518, + "num_tokens": 108112991.0, + "step": 4257 + }, + { + "epoch": 0.4676037777289699, + "grad_norm": 2.380596876144409, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7115694284439087, + "num_tokens": 108135281.0, + "step": 4258 + }, + { + "epoch": 0.4677135954315836, + "grad_norm": 2.2692129611968994, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.6982308030128479, + "num_tokens": 108160099.0, + "step": 4259 + }, + { + "epoch": 0.46782341313419723, + "grad_norm": 2.309626340866089, + 
"learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7098742723464966, + "num_tokens": 108182330.0, + "step": 4260 + }, + { + "epoch": 0.4679332308368109, + "grad_norm": 2.0918726921081543, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.6942198276519775, + "num_tokens": 108209705.0, + "step": 4261 + }, + { + "epoch": 0.4680430485394246, + "grad_norm": 2.4442672729492188, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7029886245727539, + "num_tokens": 108230295.0, + "step": 4262 + }, + { + "epoch": 0.4681528662420382, + "grad_norm": 2.5346670150756836, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7038134336471558, + "num_tokens": 108250084.0, + "step": 4263 + }, + { + "epoch": 0.46826268394465187, + "grad_norm": 2.28078031539917, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7020873427391052, + "num_tokens": 108274040.0, + "step": 4264 + }, + { + "epoch": 0.4683725016472655, + "grad_norm": 2.242781400680542, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7032240629196167, + "num_tokens": 108299867.0, + "step": 4265 + }, + { + "epoch": 0.4684823193498792, + "grad_norm": 2.235666513442993, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7162165641784668, + "num_tokens": 108324777.0, + "step": 4266 + }, + { + "epoch": 0.46859213705249286, + "grad_norm": 2.5529637336730957, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7280070185661316, + "num_tokens": 108344442.0, + "step": 4267 + }, + { + "epoch": 0.4687019547551065, + "grad_norm": 2.5102176666259766, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7139612436294556, + "num_tokens": 108364261.0, + "step": 4268 + }, + { + "epoch": 0.4688117724577202, + "grad_norm": 2.2768633365631104, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7072117328643799, + "num_tokens": 108388020.0, + "step": 4269 
+ }, + { + "epoch": 0.46892159016033386, + "grad_norm": 2.3186233043670654, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7084482908248901, + "num_tokens": 108410818.0, + "step": 4270 + }, + { + "epoch": 0.4690314078629475, + "grad_norm": 2.245241641998291, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.705225944519043, + "num_tokens": 108435269.0, + "step": 4271 + }, + { + "epoch": 0.46914122556556115, + "grad_norm": 2.209026575088501, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.6897248029708862, + "num_tokens": 108460375.0, + "step": 4272 + }, + { + "epoch": 0.46925104326817485, + "grad_norm": 2.0515308380126953, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7143676280975342, + "num_tokens": 108487779.0, + "step": 4273 + }, + { + "epoch": 0.4693608609707885, + "grad_norm": 2.2616193294525146, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.7055548429489136, + "num_tokens": 108511520.0, + "step": 4274 + }, + { + "epoch": 0.46947067867340214, + "grad_norm": 2.0140249729156494, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7105681300163269, + "num_tokens": 108538352.0, + "step": 4275 + }, + { + "epoch": 0.4695804963760158, + "grad_norm": 2.2421092987060547, + "learning_rate": 1e-06, + "loss": 1.0331, + "mean_token_accuracy": 0.6992306709289551, + "num_tokens": 108563436.0, + "step": 4276 + }, + { + "epoch": 0.4696903140786295, + "grad_norm": 2.49574613571167, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.70062655210495, + "num_tokens": 108584172.0, + "step": 4277 + }, + { + "epoch": 0.46980013178124314, + "grad_norm": 2.090639352798462, + "learning_rate": 1e-06, + "loss": 1.0759, + "mean_token_accuracy": 0.6776832342147827, + "num_tokens": 108613626.0, + "step": 4278 + }, + { + "epoch": 0.4699099494838568, + "grad_norm": 2.279841899871826, + "learning_rate": 1e-06, + "loss": 0.9875, + 
"mean_token_accuracy": 0.7081486582756042, + "num_tokens": 108638119.0, + "step": 4279 + }, + { + "epoch": 0.4700197671864705, + "grad_norm": 2.0190868377685547, + "learning_rate": 1e-06, + "loss": 1.0517, + "mean_token_accuracy": 0.6823251247406006, + "num_tokens": 108666100.0, + "step": 4280 + }, + { + "epoch": 0.47012958488908413, + "grad_norm": 2.4207916259765625, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7006575465202332, + "num_tokens": 108687752.0, + "step": 4281 + }, + { + "epoch": 0.4702394025916978, + "grad_norm": 2.010784149169922, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.692419171333313, + "num_tokens": 108717090.0, + "step": 4282 + }, + { + "epoch": 0.4703492202943114, + "grad_norm": 2.6239283084869385, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7376039624214172, + "num_tokens": 108738492.0, + "step": 4283 + }, + { + "epoch": 0.4704590379969251, + "grad_norm": 2.1867895126342773, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7101825475692749, + "num_tokens": 108764341.0, + "step": 4284 + }, + { + "epoch": 0.47056885569953877, + "grad_norm": 2.268507480621338, + "learning_rate": 1e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.6897419691085815, + "num_tokens": 108788111.0, + "step": 4285 + }, + { + "epoch": 0.4706786734021524, + "grad_norm": 2.2936129570007324, + "learning_rate": 1e-06, + "loss": 1.029, + "mean_token_accuracy": 0.690380871295929, + "num_tokens": 108814643.0, + "step": 4286 + }, + { + "epoch": 0.4707884911047661, + "grad_norm": 2.320977210998535, + "learning_rate": 1e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.6911643743515015, + "num_tokens": 108839734.0, + "step": 4287 + }, + { + "epoch": 0.47089830880737976, + "grad_norm": 2.173959970474243, + "learning_rate": 1e-06, + "loss": 1.0604, + "mean_token_accuracy": 0.6824942231178284, + "num_tokens": 108868490.0, + "step": 4288 + }, + { + "epoch": 0.4710081265099934, + 
"grad_norm": 1.844190239906311, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.6980084180831909, + "num_tokens": 108902126.0, + "step": 4289 + }, + { + "epoch": 0.47111794421260705, + "grad_norm": 2.323918581008911, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.697395920753479, + "num_tokens": 108928555.0, + "step": 4290 + }, + { + "epoch": 0.47122776191522076, + "grad_norm": 2.3858206272125244, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7127431631088257, + "num_tokens": 108949512.0, + "step": 4291 + }, + { + "epoch": 0.4713375796178344, + "grad_norm": 1.9638981819152832, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.6958875060081482, + "num_tokens": 108981709.0, + "step": 4292 + }, + { + "epoch": 0.47144739732044805, + "grad_norm": 2.030526638031006, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7016177177429199, + "num_tokens": 109014236.0, + "step": 4293 + }, + { + "epoch": 0.4715572150230617, + "grad_norm": 2.14434814453125, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.6975938677787781, + "num_tokens": 109040217.0, + "step": 4294 + }, + { + "epoch": 0.4716670327256754, + "grad_norm": 2.147646427154541, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.7205996513366699, + "num_tokens": 109064045.0, + "step": 4295 + }, + { + "epoch": 0.47177685042828904, + "grad_norm": 2.157660961151123, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7023366689682007, + "num_tokens": 109092372.0, + "step": 4296 + }, + { + "epoch": 0.4718866681309027, + "grad_norm": 2.170546531677246, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7295152544975281, + "num_tokens": 109119172.0, + "step": 4297 + }, + { + "epoch": 0.4719964858335164, + "grad_norm": 2.276350498199463, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.6932737827301025, + 
"num_tokens": 109143529.0, + "step": 4298 + }, + { + "epoch": 0.47210630353613003, + "grad_norm": 2.315671443939209, + "learning_rate": 1e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.688633918762207, + "num_tokens": 109169853.0, + "step": 4299 + }, + { + "epoch": 0.4722161212387437, + "grad_norm": 2.0721328258514404, + "learning_rate": 1e-06, + "loss": 1.0435, + "mean_token_accuracy": 0.6897048354148865, + "num_tokens": 109199212.0, + "step": 4300 + }, + { + "epoch": 0.4723259389413573, + "grad_norm": 2.037064552307129, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7097334861755371, + "num_tokens": 109227150.0, + "step": 4301 + }, + { + "epoch": 0.472435756643971, + "grad_norm": 2.379995346069336, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7158147692680359, + "num_tokens": 109249791.0, + "step": 4302 + }, + { + "epoch": 0.4725455743465847, + "grad_norm": 2.2876245975494385, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7134495377540588, + "num_tokens": 109273985.0, + "step": 4303 + }, + { + "epoch": 0.4726553920491983, + "grad_norm": 2.099781036376953, + "learning_rate": 1e-06, + "loss": 1.1099, + "mean_token_accuracy": 0.6714775562286377, + "num_tokens": 109305751.0, + "step": 4304 + }, + { + "epoch": 0.47276520975181197, + "grad_norm": 2.232825517654419, + "learning_rate": 1e-06, + "loss": 1.056, + "mean_token_accuracy": 0.6813685894012451, + "num_tokens": 109333604.0, + "step": 4305 + }, + { + "epoch": 0.47287502745442567, + "grad_norm": 2.252237558364868, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7098862528800964, + "num_tokens": 109360363.0, + "step": 4306 + }, + { + "epoch": 0.4729848451570393, + "grad_norm": 2.275103807449341, + "learning_rate": 1e-06, + "loss": 1.016, + "mean_token_accuracy": 0.6952663064002991, + "num_tokens": 109386760.0, + "step": 4307 + }, + { + "epoch": 0.47309466285965296, + "grad_norm": 2.496345281600952, + "learning_rate": 
1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7229472398757935, + "num_tokens": 109407187.0, + "step": 4308 + }, + { + "epoch": 0.47320448056226666, + "grad_norm": 2.712995767593384, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7112299203872681, + "num_tokens": 109425453.0, + "step": 4309 + }, + { + "epoch": 0.4733142982648803, + "grad_norm": 2.1125988960266113, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7123957872390747, + "num_tokens": 109452031.0, + "step": 4310 + }, + { + "epoch": 0.47342411596749395, + "grad_norm": 2.1257803440093994, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.6973901391029358, + "num_tokens": 109478128.0, + "step": 4311 + }, + { + "epoch": 0.4735339336701076, + "grad_norm": 2.1135928630828857, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7064763307571411, + "num_tokens": 109504036.0, + "step": 4312 + }, + { + "epoch": 0.4736437513727213, + "grad_norm": 2.185594320297241, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7233409881591797, + "num_tokens": 109529792.0, + "step": 4313 + }, + { + "epoch": 0.47375356907533495, + "grad_norm": 1.8726742267608643, + "learning_rate": 1e-06, + "loss": 1.0776, + "mean_token_accuracy": 0.6857763528823853, + "num_tokens": 109562191.0, + "step": 4314 + }, + { + "epoch": 0.4738633867779486, + "grad_norm": 2.3516783714294434, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7106955051422119, + "num_tokens": 109585488.0, + "step": 4315 + }, + { + "epoch": 0.4739732044805623, + "grad_norm": 2.415064811706543, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7170006036758423, + "num_tokens": 109607760.0, + "step": 4316 + }, + { + "epoch": 0.47408302218317594, + "grad_norm": 2.099837064743042, + "learning_rate": 1e-06, + "loss": 1.0472, + "mean_token_accuracy": 0.6864368319511414, + "num_tokens": 109637248.0, + "step": 4317 + }, + { + 
"epoch": 0.4741928398857896, + "grad_norm": 2.051971912384033, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.6967622637748718, + "num_tokens": 109663985.0, + "step": 4318 + }, + { + "epoch": 0.47430265758840323, + "grad_norm": 1.9972354173660278, + "learning_rate": 1e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.689642071723938, + "num_tokens": 109695697.0, + "step": 4319 + }, + { + "epoch": 0.47441247529101693, + "grad_norm": 1.9911812543869019, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7134350538253784, + "num_tokens": 109723389.0, + "step": 4320 + }, + { + "epoch": 0.4745222929936306, + "grad_norm": 2.10237979888916, + "learning_rate": 1e-06, + "loss": 1.0906, + "mean_token_accuracy": 0.6766610145568848, + "num_tokens": 109753128.0, + "step": 4321 + }, + { + "epoch": 0.4746321106962442, + "grad_norm": 2.1118454933166504, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7189128994941711, + "num_tokens": 109778976.0, + "step": 4322 + }, + { + "epoch": 0.47474192839885787, + "grad_norm": 2.2136447429656982, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.6997869610786438, + "num_tokens": 109803745.0, + "step": 4323 + }, + { + "epoch": 0.4748517461014716, + "grad_norm": 2.233741283416748, + "learning_rate": 1e-06, + "loss": 1.0162, + "mean_token_accuracy": 0.6926572322845459, + "num_tokens": 109828790.0, + "step": 4324 + }, + { + "epoch": 0.4749615638040852, + "grad_norm": 2.253448486328125, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.7030487656593323, + "num_tokens": 109854026.0, + "step": 4325 + }, + { + "epoch": 0.47507138150669886, + "grad_norm": 2.5652477741241455, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7139496803283691, + "num_tokens": 109873369.0, + "step": 4326 + }, + { + "epoch": 0.47518119920931257, + "grad_norm": 2.0519182682037354, + "learning_rate": 1e-06, + "loss": 1.0317, + 
"mean_token_accuracy": 0.6834475994110107, + "num_tokens": 109902239.0, + "step": 4327 + }, + { + "epoch": 0.4752910169119262, + "grad_norm": 1.9878745079040527, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.6949583888053894, + "num_tokens": 109935613.0, + "step": 4328 + }, + { + "epoch": 0.47540083461453986, + "grad_norm": 2.4258921146392822, + "learning_rate": 1e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.6931689977645874, + "num_tokens": 109958949.0, + "step": 4329 + }, + { + "epoch": 0.4755106523171535, + "grad_norm": 2.2432916164398193, + "learning_rate": 1e-06, + "loss": 1.028, + "mean_token_accuracy": 0.6878421306610107, + "num_tokens": 109983893.0, + "step": 4330 + }, + { + "epoch": 0.4756204700197672, + "grad_norm": 2.202538251876831, + "learning_rate": 1e-06, + "loss": 1.083, + "mean_token_accuracy": 0.6777900457382202, + "num_tokens": 110010029.0, + "step": 4331 + }, + { + "epoch": 0.47573028772238085, + "grad_norm": 2.0793683528900146, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7149506211280823, + "num_tokens": 110036978.0, + "step": 4332 + }, + { + "epoch": 0.4758401054249945, + "grad_norm": 2.3925843238830566, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7121270895004272, + "num_tokens": 110060494.0, + "step": 4333 + }, + { + "epoch": 0.47594992312760814, + "grad_norm": 2.1734273433685303, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.718401312828064, + "num_tokens": 110085070.0, + "step": 4334 + }, + { + "epoch": 0.47605974083022184, + "grad_norm": 2.332427501678467, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7102888226509094, + "num_tokens": 110106587.0, + "step": 4335 + }, + { + "epoch": 0.4761695585328355, + "grad_norm": 2.308950185775757, + "learning_rate": 1e-06, + "loss": 1.0459, + "mean_token_accuracy": 0.6860594153404236, + "num_tokens": 110132053.0, + "step": 4336 + }, + { + "epoch": 0.47627937623544914, + 
"grad_norm": 2.2129735946655273, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7189251184463501, + "num_tokens": 110155766.0, + "step": 4337 + }, + { + "epoch": 0.47638919393806284, + "grad_norm": 2.0642693042755127, + "learning_rate": 1e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.6928993463516235, + "num_tokens": 110183471.0, + "step": 4338 + }, + { + "epoch": 0.4764990116406765, + "grad_norm": 2.676378011703491, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7168210744857788, + "num_tokens": 110203526.0, + "step": 4339 + }, + { + "epoch": 0.47660882934329013, + "grad_norm": 2.514383554458618, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.705573558807373, + "num_tokens": 110225089.0, + "step": 4340 + }, + { + "epoch": 0.4767186470459038, + "grad_norm": 2.8324224948883057, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7151762843132019, + "num_tokens": 110241619.0, + "step": 4341 + }, + { + "epoch": 0.4768284647485175, + "grad_norm": 2.0961012840270996, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7106834053993225, + "num_tokens": 110268626.0, + "step": 4342 + }, + { + "epoch": 0.4769382824511311, + "grad_norm": 2.53021502494812, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7242211103439331, + "num_tokens": 110289849.0, + "step": 4343 + }, + { + "epoch": 0.47704810015374477, + "grad_norm": 2.4463322162628174, + "learning_rate": 1e-06, + "loss": 1.044, + "mean_token_accuracy": 0.6903213262557983, + "num_tokens": 110311684.0, + "step": 4344 + }, + { + "epoch": 0.47715791785635847, + "grad_norm": 1.9691510200500488, + "learning_rate": 1e-06, + "loss": 1.0488, + "mean_token_accuracy": 0.6856988668441772, + "num_tokens": 110343787.0, + "step": 4345 + }, + { + "epoch": 0.4772677355589721, + "grad_norm": 2.119527816772461, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7216593623161316, + 
"num_tokens": 110369285.0, + "step": 4346 + }, + { + "epoch": 0.47737755326158576, + "grad_norm": 2.2655539512634277, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7074335813522339, + "num_tokens": 110394263.0, + "step": 4347 + }, + { + "epoch": 0.4774873709641994, + "grad_norm": 2.0780835151672363, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7049487233161926, + "num_tokens": 110421034.0, + "step": 4348 + }, + { + "epoch": 0.4775971886668131, + "grad_norm": 2.4240963459014893, + "learning_rate": 1e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7146903276443481, + "num_tokens": 110440457.0, + "step": 4349 + }, + { + "epoch": 0.47770700636942676, + "grad_norm": 1.911468267440796, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.6951608657836914, + "num_tokens": 110473395.0, + "step": 4350 + }, + { + "epoch": 0.4778168240720404, + "grad_norm": 2.3421125411987305, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7037608623504639, + "num_tokens": 110498275.0, + "step": 4351 + }, + { + "epoch": 0.47792664177465405, + "grad_norm": 2.1496083736419678, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.689979076385498, + "num_tokens": 110524244.0, + "step": 4352 + }, + { + "epoch": 0.47803645947726775, + "grad_norm": 1.9956214427947998, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.6961621046066284, + "num_tokens": 110557499.0, + "step": 4353 + }, + { + "epoch": 0.4781462771798814, + "grad_norm": 2.5487797260284424, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7097508907318115, + "num_tokens": 110577916.0, + "step": 4354 + }, + { + "epoch": 0.47825609488249504, + "grad_norm": 2.4638795852661133, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7152948379516602, + "num_tokens": 110597697.0, + "step": 4355 + }, + { + "epoch": 0.47836591258510874, + "grad_norm": 2.187574625015259, + 
"learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7235908508300781, + "num_tokens": 110621498.0, + "step": 4356 + }, + { + "epoch": 0.4784757302877224, + "grad_norm": 2.1282079219818115, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.7012736797332764, + "num_tokens": 110648479.0, + "step": 4357 + }, + { + "epoch": 0.47858554799033604, + "grad_norm": 2.043874502182007, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.6972212791442871, + "num_tokens": 110679280.0, + "step": 4358 + }, + { + "epoch": 0.4786953656929497, + "grad_norm": 2.3070061206817627, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.7039370536804199, + "num_tokens": 110703640.0, + "step": 4359 + }, + { + "epoch": 0.4788051833955634, + "grad_norm": 2.1298787593841553, + "learning_rate": 1e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.6999351978302002, + "num_tokens": 110732023.0, + "step": 4360 + }, + { + "epoch": 0.47891500109817703, + "grad_norm": 2.608638048171997, + "learning_rate": 1e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7170605659484863, + "num_tokens": 110750498.0, + "step": 4361 + }, + { + "epoch": 0.4790248188007907, + "grad_norm": 2.442681312561035, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7115539908409119, + "num_tokens": 110770607.0, + "step": 4362 + }, + { + "epoch": 0.4791346365034044, + "grad_norm": 2.572664260864258, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.70138019323349, + "num_tokens": 110791089.0, + "step": 4363 + }, + { + "epoch": 0.479244454206018, + "grad_norm": 2.2798144817352295, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7238532900810242, + "num_tokens": 110815060.0, + "step": 4364 + }, + { + "epoch": 0.47935427190863167, + "grad_norm": 2.092942476272583, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7079898118972778, + "num_tokens": 110842735.0, + "step": 4365 + 
}, + { + "epoch": 0.4794640896112453, + "grad_norm": 2.1234540939331055, + "learning_rate": 1e-06, + "loss": 1.0343, + "mean_token_accuracy": 0.6897666454315186, + "num_tokens": 110870692.0, + "step": 4366 + }, + { + "epoch": 0.479573907313859, + "grad_norm": 2.067704916000366, + "learning_rate": 1e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.6852515935897827, + "num_tokens": 110899759.0, + "step": 4367 + }, + { + "epoch": 0.47968372501647266, + "grad_norm": 2.252964496612549, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.6841205358505249, + "num_tokens": 110923995.0, + "step": 4368 + }, + { + "epoch": 0.4797935427190863, + "grad_norm": 2.1531307697296143, + "learning_rate": 1e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.6841391324996948, + "num_tokens": 110951256.0, + "step": 4369 + }, + { + "epoch": 0.47990336042169995, + "grad_norm": 2.137601852416992, + "learning_rate": 1e-06, + "loss": 1.0529, + "mean_token_accuracy": 0.6900572776794434, + "num_tokens": 110978789.0, + "step": 4370 + }, + { + "epoch": 0.48001317812431366, + "grad_norm": 2.212838888168335, + "learning_rate": 1e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7264878153800964, + "num_tokens": 111002693.0, + "step": 4371 + }, + { + "epoch": 0.4801229958269273, + "grad_norm": 2.2530710697174072, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.7026276588439941, + "num_tokens": 111028712.0, + "step": 4372 + }, + { + "epoch": 0.48023281352954095, + "grad_norm": 2.5813958644866943, + "learning_rate": 1e-06, + "loss": 1.0377, + "mean_token_accuracy": 0.6924343109130859, + "num_tokens": 111049632.0, + "step": 4373 + }, + { + "epoch": 0.48034263123215465, + "grad_norm": 2.141023874282837, + "learning_rate": 1e-06, + "loss": 1.0448, + "mean_token_accuracy": 0.681088924407959, + "num_tokens": 111078816.0, + "step": 4374 + }, + { + "epoch": 0.4804524489347683, + "grad_norm": 2.1515796184539795, + "learning_rate": 1e-06, + "loss": 0.9558, + 
"mean_token_accuracy": 0.7063442468643188, + "num_tokens": 111105731.0, + "step": 4375 + }, + { + "epoch": 0.48056226663738194, + "grad_norm": 2.0508058071136475, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.6969788074493408, + "num_tokens": 111133207.0, + "step": 4376 + }, + { + "epoch": 0.4806720843399956, + "grad_norm": 2.371175527572632, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7145304083824158, + "num_tokens": 111154896.0, + "step": 4377 + }, + { + "epoch": 0.4807819020426093, + "grad_norm": 2.128809928894043, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7239418029785156, + "num_tokens": 111180395.0, + "step": 4378 + }, + { + "epoch": 0.48089171974522293, + "grad_norm": 2.428229331970215, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7126066088676453, + "num_tokens": 111202918.0, + "step": 4379 + }, + { + "epoch": 0.4810015374478366, + "grad_norm": 2.494891405105591, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.691962718963623, + "num_tokens": 111223786.0, + "step": 4380 + }, + { + "epoch": 0.4811113551504502, + "grad_norm": 2.1917929649353027, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.7058649063110352, + "num_tokens": 111249328.0, + "step": 4381 + }, + { + "epoch": 0.4812211728530639, + "grad_norm": 2.082979440689087, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7020094990730286, + "num_tokens": 111275795.0, + "step": 4382 + }, + { + "epoch": 0.4813309905556776, + "grad_norm": 2.3324925899505615, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7152969837188721, + "num_tokens": 111296880.0, + "step": 4383 + }, + { + "epoch": 0.4814408082582912, + "grad_norm": 2.277056932449341, + "learning_rate": 1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.6959356665611267, + "num_tokens": 111322557.0, + "step": 4384 + }, + { + "epoch": 0.4815506259609049, + 
"grad_norm": 2.499351978302002, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7046407461166382, + "num_tokens": 111342367.0, + "step": 4385 + }, + { + "epoch": 0.48166044366351857, + "grad_norm": 2.2105116844177246, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.734049916267395, + "num_tokens": 111367576.0, + "step": 4386 + }, + { + "epoch": 0.4817702613661322, + "grad_norm": 2.6882307529449463, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.707904040813446, + "num_tokens": 111386310.0, + "step": 4387 + }, + { + "epoch": 0.48188007906874586, + "grad_norm": 2.2267110347747803, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7170778512954712, + "num_tokens": 111408920.0, + "step": 4388 + }, + { + "epoch": 0.48198989677135956, + "grad_norm": 2.2224104404449463, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7121263742446899, + "num_tokens": 111432403.0, + "step": 4389 + }, + { + "epoch": 0.4820997144739732, + "grad_norm": 2.4557695388793945, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.7012096047401428, + "num_tokens": 111454635.0, + "step": 4390 + }, + { + "epoch": 0.48220953217658685, + "grad_norm": 2.164395570755005, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7141143083572388, + "num_tokens": 111480932.0, + "step": 4391 + }, + { + "epoch": 0.48231934987920055, + "grad_norm": 2.071195125579834, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6946651935577393, + "num_tokens": 111509238.0, + "step": 4392 + }, + { + "epoch": 0.4824291675818142, + "grad_norm": 2.4843361377716064, + "learning_rate": 1e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7118138074874878, + "num_tokens": 111529394.0, + "step": 4393 + }, + { + "epoch": 0.48253898528442785, + "grad_norm": 2.9502758979797363, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7087229490280151, + 
"num_tokens": 111546244.0, + "step": 4394 + }, + { + "epoch": 0.4826488029870415, + "grad_norm": 2.017486333847046, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7032613158226013, + "num_tokens": 111574855.0, + "step": 4395 + }, + { + "epoch": 0.4827586206896552, + "grad_norm": 2.045844316482544, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.7263633012771606, + "num_tokens": 111600111.0, + "step": 4396 + }, + { + "epoch": 0.48286843839226884, + "grad_norm": 2.4972357749938965, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7079954147338867, + "num_tokens": 111621932.0, + "step": 4397 + }, + { + "epoch": 0.4829782560948825, + "grad_norm": 2.4724466800689697, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.6966862678527832, + "num_tokens": 111642064.0, + "step": 4398 + }, + { + "epoch": 0.48308807379749613, + "grad_norm": 2.5233747959136963, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.7041122317314148, + "num_tokens": 111662741.0, + "step": 4399 + }, + { + "epoch": 0.48319789150010983, + "grad_norm": 2.3461194038391113, + "learning_rate": 1e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7279423475265503, + "num_tokens": 111683849.0, + "step": 4400 + }, + { + "epoch": 0.4833077092027235, + "grad_norm": 2.1252615451812744, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7094597220420837, + "num_tokens": 111709473.0, + "step": 4401 + }, + { + "epoch": 0.4834175269053371, + "grad_norm": 2.289308786392212, + "learning_rate": 1e-06, + "loss": 1.0298, + "mean_token_accuracy": 0.6908112168312073, + "num_tokens": 111732746.0, + "step": 4402 + }, + { + "epoch": 0.4835273446079508, + "grad_norm": 2.1952803134918213, + "learning_rate": 1e-06, + "loss": 1.069, + "mean_token_accuracy": 0.6790679693222046, + "num_tokens": 111757827.0, + "step": 4403 + }, + { + "epoch": 0.4836371623105645, + "grad_norm": 2.0219430923461914, + 
"learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7039255499839783, + "num_tokens": 111785560.0, + "step": 4404 + }, + { + "epoch": 0.4837469800131781, + "grad_norm": 2.0701792240142822, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7172147631645203, + "num_tokens": 111812953.0, + "step": 4405 + }, + { + "epoch": 0.48385679771579176, + "grad_norm": 2.102524995803833, + "learning_rate": 1e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.7114729881286621, + "num_tokens": 111842051.0, + "step": 4406 + }, + { + "epoch": 0.48396661541840547, + "grad_norm": 2.2399165630340576, + "learning_rate": 1e-06, + "loss": 1.0691, + "mean_token_accuracy": 0.6880731582641602, + "num_tokens": 111869896.0, + "step": 4407 + }, + { + "epoch": 0.4840764331210191, + "grad_norm": 2.230520725250244, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.7011663913726807, + "num_tokens": 111894738.0, + "step": 4408 + }, + { + "epoch": 0.48418625082363276, + "grad_norm": 2.454345226287842, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7060556411743164, + "num_tokens": 111915815.0, + "step": 4409 + }, + { + "epoch": 0.4842960685262464, + "grad_norm": 2.136808156967163, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7007410526275635, + "num_tokens": 111944855.0, + "step": 4410 + }, + { + "epoch": 0.4844058862288601, + "grad_norm": 2.529416084289551, + "learning_rate": 1e-06, + "loss": 0.881, + "mean_token_accuracy": 0.7287459373474121, + "num_tokens": 111964988.0, + "step": 4411 + }, + { + "epoch": 0.48451570393147375, + "grad_norm": 2.3156118392944336, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7158999443054199, + "num_tokens": 111988896.0, + "step": 4412 + }, + { + "epoch": 0.4846255216340874, + "grad_norm": 2.3726682662963867, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7088617086410522, + "num_tokens": 112010301.0, + "step": 4413 
+ }, + { + "epoch": 0.4847353393367011, + "grad_norm": 2.355546236038208, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.711661696434021, + "num_tokens": 112033682.0, + "step": 4414 + }, + { + "epoch": 0.48484515703931474, + "grad_norm": 2.2803831100463867, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.697679340839386, + "num_tokens": 112059480.0, + "step": 4415 + }, + { + "epoch": 0.4849549747419284, + "grad_norm": 2.763190746307373, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7155139446258545, + "num_tokens": 112076523.0, + "step": 4416 + }, + { + "epoch": 0.48506479244454204, + "grad_norm": 2.3905324935913086, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7126510143280029, + "num_tokens": 112098640.0, + "step": 4417 + }, + { + "epoch": 0.48517461014715574, + "grad_norm": 2.2519357204437256, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7076946496963501, + "num_tokens": 112123083.0, + "step": 4418 + }, + { + "epoch": 0.4852844278497694, + "grad_norm": 2.182316541671753, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7230374217033386, + "num_tokens": 112147520.0, + "step": 4419 + }, + { + "epoch": 0.48539424555238303, + "grad_norm": 2.122349739074707, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6936309337615967, + "num_tokens": 112173754.0, + "step": 4420 + }, + { + "epoch": 0.48550406325499673, + "grad_norm": 2.4576363563537598, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7042658925056458, + "num_tokens": 112195153.0, + "step": 4421 + }, + { + "epoch": 0.4856138809576104, + "grad_norm": 2.486379384994507, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.6981505155563354, + "num_tokens": 112217569.0, + "step": 4422 + }, + { + "epoch": 0.485723698660224, + "grad_norm": 2.0344626903533936, + "learning_rate": 1e-06, + "loss": 1.0696, + 
"mean_token_accuracy": 0.6842302083969116, + "num_tokens": 112246160.0, + "step": 4423 + }, + { + "epoch": 0.48583351636283767, + "grad_norm": 2.488624334335327, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7354404926300049, + "num_tokens": 112267045.0, + "step": 4424 + }, + { + "epoch": 0.48594333406545137, + "grad_norm": 2.3995604515075684, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7127734422683716, + "num_tokens": 112287076.0, + "step": 4425 + }, + { + "epoch": 0.486053151768065, + "grad_norm": 2.558095932006836, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7121711373329163, + "num_tokens": 112306364.0, + "step": 4426 + }, + { + "epoch": 0.48616296947067866, + "grad_norm": 1.8997124433517456, + "learning_rate": 1e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.6857671737670898, + "num_tokens": 112337722.0, + "step": 4427 + }, + { + "epoch": 0.4862727871732923, + "grad_norm": 2.0987136363983154, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.7021199464797974, + "num_tokens": 112366237.0, + "step": 4428 + }, + { + "epoch": 0.486382604875906, + "grad_norm": 2.221505880355835, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7016159296035767, + "num_tokens": 112391592.0, + "step": 4429 + }, + { + "epoch": 0.48649242257851966, + "grad_norm": 2.2443742752075195, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.6963193416595459, + "num_tokens": 112417731.0, + "step": 4430 + }, + { + "epoch": 0.4866022402811333, + "grad_norm": 2.020012378692627, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7017800807952881, + "num_tokens": 112446741.0, + "step": 4431 + }, + { + "epoch": 0.486712057983747, + "grad_norm": 1.9596501588821411, + "learning_rate": 1e-06, + "loss": 1.0994, + "mean_token_accuracy": 0.6745570302009583, + "num_tokens": 112480875.0, + "step": 4432 + }, + { + "epoch": 0.48682187568636065, + 
"grad_norm": 2.2535879611968994, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.681499719619751, + "num_tokens": 112505368.0, + "step": 4433 + }, + { + "epoch": 0.4869316933889743, + "grad_norm": 2.061007022857666, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7068670988082886, + "num_tokens": 112533767.0, + "step": 4434 + }, + { + "epoch": 0.48704151109158794, + "grad_norm": 2.4198665618896484, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7021592855453491, + "num_tokens": 112557615.0, + "step": 4435 + }, + { + "epoch": 0.48715132879420164, + "grad_norm": 1.8879690170288086, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.7020409107208252, + "num_tokens": 112589032.0, + "step": 4436 + }, + { + "epoch": 0.4872611464968153, + "grad_norm": 2.4104676246643066, + "learning_rate": 1e-06, + "loss": 1.0385, + "mean_token_accuracy": 0.6936840415000916, + "num_tokens": 112608967.0, + "step": 4437 + }, + { + "epoch": 0.48737096419942894, + "grad_norm": 1.9846807718276978, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7006350755691528, + "num_tokens": 112641253.0, + "step": 4438 + }, + { + "epoch": 0.48748078190204264, + "grad_norm": 2.1813173294067383, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.6895313262939453, + "num_tokens": 112668178.0, + "step": 4439 + }, + { + "epoch": 0.4875905996046563, + "grad_norm": 2.221505641937256, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7008800506591797, + "num_tokens": 112693628.0, + "step": 4440 + }, + { + "epoch": 0.48770041730726993, + "grad_norm": 2.110565185546875, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7321476936340332, + "num_tokens": 112718613.0, + "step": 4441 + }, + { + "epoch": 0.4878102350098836, + "grad_norm": 2.2815401554107666, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7165353894233704, + 
"num_tokens": 112742442.0, + "step": 4442 + }, + { + "epoch": 0.4879200527124973, + "grad_norm": 2.1321585178375244, + "learning_rate": 1e-06, + "loss": 1.0078, + "mean_token_accuracy": 0.6984786987304688, + "num_tokens": 112769763.0, + "step": 4443 + }, + { + "epoch": 0.4880298704151109, + "grad_norm": 2.48398756980896, + "learning_rate": 1e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.733877420425415, + "num_tokens": 112788115.0, + "step": 4444 + }, + { + "epoch": 0.48813968811772457, + "grad_norm": 2.2498462200164795, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.6908376216888428, + "num_tokens": 112812862.0, + "step": 4445 + }, + { + "epoch": 0.4882495058203382, + "grad_norm": 2.4166126251220703, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.7018688917160034, + "num_tokens": 112835144.0, + "step": 4446 + }, + { + "epoch": 0.4883593235229519, + "grad_norm": 2.3765640258789062, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.7071850299835205, + "num_tokens": 112858093.0, + "step": 4447 + }, + { + "epoch": 0.48846914122556556, + "grad_norm": 2.4497451782226562, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.720446765422821, + "num_tokens": 112877897.0, + "step": 4448 + }, + { + "epoch": 0.4885789589281792, + "grad_norm": 2.2438158988952637, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.709084689617157, + "num_tokens": 112902084.0, + "step": 4449 + }, + { + "epoch": 0.4886887766307929, + "grad_norm": 2.389476776123047, + "learning_rate": 1e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.7244527339935303, + "num_tokens": 112923088.0, + "step": 4450 + }, + { + "epoch": 0.48879859433340656, + "grad_norm": 2.1516242027282715, + "learning_rate": 1e-06, + "loss": 1.0317, + "mean_token_accuracy": 0.6937862634658813, + "num_tokens": 112951031.0, + "step": 4451 + }, + { + "epoch": 0.4889084120360202, + "grad_norm": 2.088078260421753, + 
"learning_rate": 1e-06, + "loss": 1.0366, + "mean_token_accuracy": 0.6817495226860046, + "num_tokens": 112979145.0, + "step": 4452 + }, + { + "epoch": 0.48901822973863385, + "grad_norm": 1.9812512397766113, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7126156687736511, + "num_tokens": 113008744.0, + "step": 4453 + }, + { + "epoch": 0.48912804744124755, + "grad_norm": 2.159238338470459, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7211064100265503, + "num_tokens": 113032516.0, + "step": 4454 + }, + { + "epoch": 0.4892378651438612, + "grad_norm": 2.1582517623901367, + "learning_rate": 1e-06, + "loss": 1.0319, + "mean_token_accuracy": 0.6834102869033813, + "num_tokens": 113060837.0, + "step": 4455 + }, + { + "epoch": 0.48934768284647484, + "grad_norm": 2.422081232070923, + "learning_rate": 1e-06, + "loss": 1.0571, + "mean_token_accuracy": 0.6863332390785217, + "num_tokens": 113084476.0, + "step": 4456 + }, + { + "epoch": 0.4894575005490885, + "grad_norm": 2.1816282272338867, + "learning_rate": 1e-06, + "loss": 1.0499, + "mean_token_accuracy": 0.6833891868591309, + "num_tokens": 113110190.0, + "step": 4457 + }, + { + "epoch": 0.4895673182517022, + "grad_norm": 2.1712777614593506, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.71772301197052, + "num_tokens": 113138143.0, + "step": 4458 + }, + { + "epoch": 0.48967713595431583, + "grad_norm": 2.420088768005371, + "learning_rate": 1e-06, + "loss": 1.0435, + "mean_token_accuracy": 0.6873198747634888, + "num_tokens": 113159528.0, + "step": 4459 + }, + { + "epoch": 0.4897869536569295, + "grad_norm": 2.496814012527466, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7175446152687073, + "num_tokens": 113179880.0, + "step": 4460 + }, + { + "epoch": 0.4898967713595432, + "grad_norm": 2.2797493934631348, + "learning_rate": 1e-06, + "loss": 1.0594, + "mean_token_accuracy": 0.6873148083686829, + "num_tokens": 113204985.0, + "step": 4461 
+ }, + { + "epoch": 0.4900065890621568, + "grad_norm": 2.0095579624176025, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7033731341362, + "num_tokens": 113234482.0, + "step": 4462 + }, + { + "epoch": 0.4901164067647705, + "grad_norm": 2.050370931625366, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7101695537567139, + "num_tokens": 113262228.0, + "step": 4463 + }, + { + "epoch": 0.4902262244673841, + "grad_norm": 2.414841651916504, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7165277600288391, + "num_tokens": 113283957.0, + "step": 4464 + }, + { + "epoch": 0.4903360421699978, + "grad_norm": 2.93405818939209, + "learning_rate": 1e-06, + "loss": 0.8729, + "mean_token_accuracy": 0.7241179347038269, + "num_tokens": 113299829.0, + "step": 4465 + }, + { + "epoch": 0.49044585987261147, + "grad_norm": 2.1277687549591064, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7190577387809753, + "num_tokens": 113324309.0, + "step": 4466 + }, + { + "epoch": 0.4905556775752251, + "grad_norm": 2.108339309692383, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.7017008662223816, + "num_tokens": 113352393.0, + "step": 4467 + }, + { + "epoch": 0.4906654952778388, + "grad_norm": 1.8641316890716553, + "learning_rate": 1e-06, + "loss": 1.0035, + "mean_token_accuracy": 0.6987582445144653, + "num_tokens": 113385289.0, + "step": 4468 + }, + { + "epoch": 0.49077531298045246, + "grad_norm": 2.125150442123413, + "learning_rate": 1e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.6979649066925049, + "num_tokens": 113412407.0, + "step": 4469 + }, + { + "epoch": 0.4908851306830661, + "grad_norm": 1.7929250001907349, + "learning_rate": 1e-06, + "loss": 1.0718, + "mean_token_accuracy": 0.6765687465667725, + "num_tokens": 113452328.0, + "step": 4470 + }, + { + "epoch": 0.49099494838567975, + "grad_norm": 2.2809395790100098, + "learning_rate": 1e-06, + "loss": 0.9944, + 
"mean_token_accuracy": 0.6946451663970947, + "num_tokens": 113475815.0, + "step": 4471 + }, + { + "epoch": 0.49110476608829345, + "grad_norm": 2.034848928451538, + "learning_rate": 1e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.6870081424713135, + "num_tokens": 113505169.0, + "step": 4472 + }, + { + "epoch": 0.4912145837909071, + "grad_norm": 1.992266297340393, + "learning_rate": 1e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.6971443891525269, + "num_tokens": 113533406.0, + "step": 4473 + }, + { + "epoch": 0.49132440149352075, + "grad_norm": 2.8257758617401123, + "learning_rate": 1e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7205921411514282, + "num_tokens": 113550123.0, + "step": 4474 + }, + { + "epoch": 0.4914342191961344, + "grad_norm": 1.9980177879333496, + "learning_rate": 1e-06, + "loss": 1.0828, + "mean_token_accuracy": 0.6699547171592712, + "num_tokens": 113581425.0, + "step": 4475 + }, + { + "epoch": 0.4915440368987481, + "grad_norm": 2.475372314453125, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7208934426307678, + "num_tokens": 113601740.0, + "step": 4476 + }, + { + "epoch": 0.49165385460136174, + "grad_norm": 2.124762773513794, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7111822366714478, + "num_tokens": 113628575.0, + "step": 4477 + }, + { + "epoch": 0.4917636723039754, + "grad_norm": 2.0230281352996826, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.6937575340270996, + "num_tokens": 113660001.0, + "step": 4478 + }, + { + "epoch": 0.4918734900065891, + "grad_norm": 1.9918479919433594, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.6990691423416138, + "num_tokens": 113689712.0, + "step": 4479 + }, + { + "epoch": 0.49198330770920273, + "grad_norm": 2.290677785873413, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7097440958023071, + "num_tokens": 113714121.0, + "step": 4480 + }, + { + "epoch": 0.4920931254118164, + 
"grad_norm": 2.150583028793335, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7061084508895874, + "num_tokens": 113740010.0, + "step": 4481 + }, + { + "epoch": 0.49220294311443, + "grad_norm": 2.175006151199341, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7253222465515137, + "num_tokens": 113766486.0, + "step": 4482 + }, + { + "epoch": 0.4923127608170437, + "grad_norm": 2.0378646850585938, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.6975817680358887, + "num_tokens": 113796245.0, + "step": 4483 + }, + { + "epoch": 0.4924225785196574, + "grad_norm": 2.206242561340332, + "learning_rate": 1e-06, + "loss": 1.0248, + "mean_token_accuracy": 0.6893282532691956, + "num_tokens": 113821058.0, + "step": 4484 + }, + { + "epoch": 0.492532396222271, + "grad_norm": 2.606274127960205, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7130675911903381, + "num_tokens": 113841270.0, + "step": 4485 + }, + { + "epoch": 0.49264221392488466, + "grad_norm": 2.2542991638183594, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.6990090608596802, + "num_tokens": 113868067.0, + "step": 4486 + }, + { + "epoch": 0.49275203162749837, + "grad_norm": 2.040004253387451, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.7062040567398071, + "num_tokens": 113897513.0, + "step": 4487 + }, + { + "epoch": 0.492861849330112, + "grad_norm": 2.3604812622070312, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7110390663146973, + "num_tokens": 113922120.0, + "step": 4488 + }, + { + "epoch": 0.49297166703272566, + "grad_norm": 2.2037715911865234, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7115184664726257, + "num_tokens": 113946748.0, + "step": 4489 + }, + { + "epoch": 0.49308148473533936, + "grad_norm": 2.2511026859283447, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7017488479614258, + 
"num_tokens": 113971241.0, + "step": 4490 + }, + { + "epoch": 0.493191302437953, + "grad_norm": 2.2024857997894287, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.6998547315597534, + "num_tokens": 113996663.0, + "step": 4491 + }, + { + "epoch": 0.49330112014056665, + "grad_norm": 2.079667329788208, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7015946507453918, + "num_tokens": 114024579.0, + "step": 4492 + }, + { + "epoch": 0.4934109378431803, + "grad_norm": 2.2168374061584473, + "learning_rate": 1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.6971921324729919, + "num_tokens": 114050535.0, + "step": 4493 + }, + { + "epoch": 0.493520755545794, + "grad_norm": 2.3242874145507812, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7101160287857056, + "num_tokens": 114073999.0, + "step": 4494 + }, + { + "epoch": 0.49363057324840764, + "grad_norm": 2.290856122970581, + "learning_rate": 1e-06, + "loss": 1.0778, + "mean_token_accuracy": 0.678342878818512, + "num_tokens": 114097848.0, + "step": 4495 + }, + { + "epoch": 0.4937403909510213, + "grad_norm": 1.9289700984954834, + "learning_rate": 1e-06, + "loss": 1.0592, + "mean_token_accuracy": 0.6813488006591797, + "num_tokens": 114133010.0, + "step": 4496 + }, + { + "epoch": 0.493850208653635, + "grad_norm": 2.385343313217163, + "learning_rate": 1e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.7012852430343628, + "num_tokens": 114156150.0, + "step": 4497 + }, + { + "epoch": 0.49396002635624864, + "grad_norm": 2.398958206176758, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7242838144302368, + "num_tokens": 114177612.0, + "step": 4498 + }, + { + "epoch": 0.4940698440588623, + "grad_norm": 2.2750627994537354, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7152208089828491, + "num_tokens": 114200883.0, + "step": 4499 + }, + { + "epoch": 0.49417966176147593, + "grad_norm": 2.2347707748413086, + 
"learning_rate": 1e-06, + "loss": 1.0431, + "mean_token_accuracy": 0.6895021796226501, + "num_tokens": 114227950.0, + "step": 4500 + }, + { + "epoch": 0.49428947946408963, + "grad_norm": 2.2339751720428467, + "learning_rate": 1e-06, + "loss": 1.0611, + "mean_token_accuracy": 0.6998504996299744, + "num_tokens": 114251605.0, + "step": 4501 + }, + { + "epoch": 0.4943992971667033, + "grad_norm": 2.0484256744384766, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.7001574039459229, + "num_tokens": 114280810.0, + "step": 4502 + }, + { + "epoch": 0.4945091148693169, + "grad_norm": 2.1741907596588135, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7081670761108398, + "num_tokens": 114306202.0, + "step": 4503 + }, + { + "epoch": 0.49461893257193057, + "grad_norm": 2.1806869506835938, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7190479040145874, + "num_tokens": 114331096.0, + "step": 4504 + }, + { + "epoch": 0.49472875027454427, + "grad_norm": 2.3140077590942383, + "learning_rate": 1e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.7347243428230286, + "num_tokens": 114353699.0, + "step": 4505 + }, + { + "epoch": 0.4948385679771579, + "grad_norm": 2.2158138751983643, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7116495370864868, + "num_tokens": 114376011.0, + "step": 4506 + }, + { + "epoch": 0.49494838567977156, + "grad_norm": 2.2326197624206543, + "learning_rate": 1e-06, + "loss": 1.0283, + "mean_token_accuracy": 0.6896662712097168, + "num_tokens": 114401087.0, + "step": 4507 + }, + { + "epoch": 0.49505820338238526, + "grad_norm": 2.231869697570801, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7086995840072632, + "num_tokens": 114424571.0, + "step": 4508 + }, + { + "epoch": 0.4951680210849989, + "grad_norm": 2.1948063373565674, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7208573818206787, + "num_tokens": 114452708.0, + "step": 
4509 + }, + { + "epoch": 0.49527783878761256, + "grad_norm": 2.3167672157287598, + "learning_rate": 1e-06, + "loss": 1.0884, + "mean_token_accuracy": 0.6814730167388916, + "num_tokens": 114479140.0, + "step": 4510 + }, + { + "epoch": 0.4953876564902262, + "grad_norm": 2.0688135623931885, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.7082586288452148, + "num_tokens": 114507825.0, + "step": 4511 + }, + { + "epoch": 0.4954974741928399, + "grad_norm": 2.405811071395874, + "learning_rate": 1e-06, + "loss": 1.0678, + "mean_token_accuracy": 0.6945143938064575, + "num_tokens": 114531372.0, + "step": 4512 + }, + { + "epoch": 0.49560729189545355, + "grad_norm": 2.1789743900299072, + "learning_rate": 1e-06, + "loss": 1.042, + "mean_token_accuracy": 0.6836209297180176, + "num_tokens": 114557312.0, + "step": 4513 + }, + { + "epoch": 0.4957171095980672, + "grad_norm": 2.238323926925659, + "learning_rate": 1e-06, + "loss": 0.8169, + "mean_token_accuracy": 0.7451380491256714, + "num_tokens": 114579038.0, + "step": 4514 + }, + { + "epoch": 0.4958269273006809, + "grad_norm": 2.4269752502441406, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7134202122688293, + "num_tokens": 114599140.0, + "step": 4515 + }, + { + "epoch": 0.49593674500329454, + "grad_norm": 2.109423875808716, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.7111261487007141, + "num_tokens": 114627462.0, + "step": 4516 + }, + { + "epoch": 0.4960465627059082, + "grad_norm": 2.3594796657562256, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7071747779846191, + "num_tokens": 114648597.0, + "step": 4517 + }, + { + "epoch": 0.49615638040852184, + "grad_norm": 2.2558515071868896, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7052170038223267, + "num_tokens": 114673156.0, + "step": 4518 + }, + { + "epoch": 0.49626619811113554, + "grad_norm": 2.3400590419769287, + "learning_rate": 1e-06, + "loss": 0.9468, + 
"mean_token_accuracy": 0.7043508887290955, + "num_tokens": 114694914.0, + "step": 4519 + }, + { + "epoch": 0.4963760158137492, + "grad_norm": 2.4604151248931885, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7222054600715637, + "num_tokens": 114716651.0, + "step": 4520 + }, + { + "epoch": 0.49648583351636283, + "grad_norm": 2.03330659866333, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7050687074661255, + "num_tokens": 114745695.0, + "step": 4521 + }, + { + "epoch": 0.4965956512189765, + "grad_norm": 2.1297779083251953, + "learning_rate": 1e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.6930672526359558, + "num_tokens": 114775948.0, + "step": 4522 + }, + { + "epoch": 0.4967054689215902, + "grad_norm": 2.312814712524414, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.7005640864372253, + "num_tokens": 114799891.0, + "step": 4523 + }, + { + "epoch": 0.4968152866242038, + "grad_norm": 2.0462558269500732, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7171213030815125, + "num_tokens": 114828761.0, + "step": 4524 + }, + { + "epoch": 0.49692510432681747, + "grad_norm": 2.4595601558685303, + "learning_rate": 1e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7203376293182373, + "num_tokens": 114849133.0, + "step": 4525 + }, + { + "epoch": 0.49703492202943117, + "grad_norm": 2.382871150970459, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7189456224441528, + "num_tokens": 114871412.0, + "step": 4526 + }, + { + "epoch": 0.4971447397320448, + "grad_norm": 2.192963123321533, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.7028458714485168, + "num_tokens": 114896311.0, + "step": 4527 + }, + { + "epoch": 0.49725455743465846, + "grad_norm": 2.474794387817383, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7016145586967468, + "num_tokens": 114916514.0, + "step": 4528 + }, + { + "epoch": 0.4973643751372721, + 
"grad_norm": 2.2218825817108154, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7100450992584229, + "num_tokens": 114940115.0, + "step": 4529 + }, + { + "epoch": 0.4974741928398858, + "grad_norm": 2.4179701805114746, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7208852767944336, + "num_tokens": 114961374.0, + "step": 4530 + }, + { + "epoch": 0.49758401054249946, + "grad_norm": 2.2209012508392334, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7071857452392578, + "num_tokens": 114985381.0, + "step": 4531 + }, + { + "epoch": 0.4976938282451131, + "grad_norm": 2.2251768112182617, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7079070210456848, + "num_tokens": 115011407.0, + "step": 4532 + }, + { + "epoch": 0.49780364594772675, + "grad_norm": 2.2099390029907227, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7028179168701172, + "num_tokens": 115035429.0, + "step": 4533 + }, + { + "epoch": 0.49791346365034045, + "grad_norm": 2.495548725128174, + "learning_rate": 1e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7424039840698242, + "num_tokens": 115055039.0, + "step": 4534 + }, + { + "epoch": 0.4980232813529541, + "grad_norm": 2.29770827293396, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.6961837410926819, + "num_tokens": 115077114.0, + "step": 4535 + }, + { + "epoch": 0.49813309905556774, + "grad_norm": 2.184980630874634, + "learning_rate": 1e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7248475551605225, + "num_tokens": 115103566.0, + "step": 4536 + }, + { + "epoch": 0.49824291675818144, + "grad_norm": 2.3234992027282715, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7077268362045288, + "num_tokens": 115127562.0, + "step": 4537 + }, + { + "epoch": 0.4983527344607951, + "grad_norm": 2.2554330825805664, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.7001246809959412, + 
"num_tokens": 115151395.0, + "step": 4538 + }, + { + "epoch": 0.49846255216340873, + "grad_norm": 2.119702100753784, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7128984332084656, + "num_tokens": 115179209.0, + "step": 4539 + }, + { + "epoch": 0.4985723698660224, + "grad_norm": 2.2214698791503906, + "learning_rate": 1e-06, + "loss": 1.0456, + "mean_token_accuracy": 0.6877575516700745, + "num_tokens": 115204811.0, + "step": 4540 + }, + { + "epoch": 0.4986821875686361, + "grad_norm": 2.1842870712280273, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7082383036613464, + "num_tokens": 115230178.0, + "step": 4541 + }, + { + "epoch": 0.4987920052712497, + "grad_norm": 2.706928014755249, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.7056263089179993, + "num_tokens": 115248019.0, + "step": 4542 + }, + { + "epoch": 0.4989018229738634, + "grad_norm": 2.335113763809204, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7061804533004761, + "num_tokens": 115270735.0, + "step": 4543 + }, + { + "epoch": 0.4990116406764771, + "grad_norm": 2.0454869270324707, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.713101327419281, + "num_tokens": 115297321.0, + "step": 4544 + }, + { + "epoch": 0.4991214583790907, + "grad_norm": 2.5538206100463867, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7051810026168823, + "num_tokens": 115317457.0, + "step": 4545 + }, + { + "epoch": 0.49923127608170437, + "grad_norm": 2.137418270111084, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.6977855563163757, + "num_tokens": 115344675.0, + "step": 4546 + }, + { + "epoch": 0.499341093784318, + "grad_norm": 2.2520852088928223, + "learning_rate": 1e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6862653493881226, + "num_tokens": 115371676.0, + "step": 4547 + }, + { + "epoch": 0.4994509114869317, + "grad_norm": 2.112677574157715, + 
"learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7034602761268616, + "num_tokens": 115398874.0, + "step": 4548 + }, + { + "epoch": 0.49956072918954536, + "grad_norm": 2.1996841430664062, + "learning_rate": 1e-06, + "loss": 1.0262, + "mean_token_accuracy": 0.6930637955665588, + "num_tokens": 115423318.0, + "step": 4549 + }, + { + "epoch": 0.499670546892159, + "grad_norm": 1.989686369895935, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.7056093215942383, + "num_tokens": 115451301.0, + "step": 4550 + }, + { + "epoch": 0.49978036459477265, + "grad_norm": 2.0323634147644043, + "learning_rate": 1e-06, + "loss": 1.0418, + "mean_token_accuracy": 0.6833981275558472, + "num_tokens": 115479865.0, + "step": 4551 + }, + { + "epoch": 0.49989018229738635, + "grad_norm": 2.311678171157837, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.706037163734436, + "num_tokens": 115502525.0, + "step": 4552 + }, + { + "epoch": 0.5, + "grad_norm": 2.0682108402252197, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7065460681915283, + "num_tokens": 115529666.0, + "step": 4553 + }, + { + "epoch": 0.5001098177026136, + "grad_norm": 2.1852779388427734, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7113522887229919, + "num_tokens": 115554545.0, + "step": 4554 + }, + { + "epoch": 0.5002196354052273, + "grad_norm": 2.149923086166382, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7095096111297607, + "num_tokens": 115579902.0, + "step": 4555 + }, + { + "epoch": 0.5003294531078409, + "grad_norm": 2.2820777893066406, + "learning_rate": 1e-06, + "loss": 1.064, + "mean_token_accuracy": 0.6789814233779907, + "num_tokens": 115605557.0, + "step": 4556 + }, + { + "epoch": 0.5004392708104547, + "grad_norm": 2.3138434886932373, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.6891275644302368, + "num_tokens": 115630009.0, + "step": 4557 + }, + { + 
"epoch": 0.5005490885130683, + "grad_norm": 2.0241036415100098, + "learning_rate": 1e-06, + "loss": 1.1008, + "mean_token_accuracy": 0.6707351207733154, + "num_tokens": 115659508.0, + "step": 4558 + }, + { + "epoch": 0.500658906215682, + "grad_norm": 2.2067997455596924, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7028656005859375, + "num_tokens": 115685517.0, + "step": 4559 + }, + { + "epoch": 0.5007687239182956, + "grad_norm": 2.6865224838256836, + "learning_rate": 1e-06, + "loss": 0.838, + "mean_token_accuracy": 0.7321799993515015, + "num_tokens": 115701789.0, + "step": 4560 + }, + { + "epoch": 0.5008785416209093, + "grad_norm": 2.273062229156494, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.6933550238609314, + "num_tokens": 115724866.0, + "step": 4561 + }, + { + "epoch": 0.5009883593235229, + "grad_norm": 2.467236042022705, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7066203355789185, + "num_tokens": 115744572.0, + "step": 4562 + }, + { + "epoch": 0.5010981770261366, + "grad_norm": 2.2017629146575928, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.714084267616272, + "num_tokens": 115768585.0, + "step": 4563 + }, + { + "epoch": 0.5012079947287503, + "grad_norm": 2.485726833343506, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.7094751596450806, + "num_tokens": 115788501.0, + "step": 4564 + }, + { + "epoch": 0.501317812431364, + "grad_norm": 2.2029004096984863, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.6974164843559265, + "num_tokens": 115813623.0, + "step": 4565 + }, + { + "epoch": 0.5014276301339776, + "grad_norm": 2.5737109184265137, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.6915475130081177, + "num_tokens": 115833456.0, + "step": 4566 + }, + { + "epoch": 0.5015374478365913, + "grad_norm": 2.531507730484009, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 
0.6945465803146362, + "num_tokens": 115855576.0, + "step": 4567 + }, + { + "epoch": 0.5016472655392049, + "grad_norm": 2.3477230072021484, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7022839784622192, + "num_tokens": 115879637.0, + "step": 4568 + }, + { + "epoch": 0.5017570832418186, + "grad_norm": 2.2768681049346924, + "learning_rate": 1e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7178227305412292, + "num_tokens": 115903429.0, + "step": 4569 + }, + { + "epoch": 0.5018669009444322, + "grad_norm": 2.354465961456299, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.6861569285392761, + "num_tokens": 115926754.0, + "step": 4570 + }, + { + "epoch": 0.5019767186470458, + "grad_norm": 2.237462043762207, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7197927236557007, + "num_tokens": 115951757.0, + "step": 4571 + }, + { + "epoch": 0.5020865363496596, + "grad_norm": 2.2286601066589355, + "learning_rate": 1e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.6851307153701782, + "num_tokens": 115978611.0, + "step": 4572 + }, + { + "epoch": 0.5021963540522733, + "grad_norm": 2.214162826538086, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.6927986145019531, + "num_tokens": 116004019.0, + "step": 4573 + }, + { + "epoch": 0.5023061717548869, + "grad_norm": 2.4055185317993164, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.6928934454917908, + "num_tokens": 116026576.0, + "step": 4574 + }, + { + "epoch": 0.5024159894575005, + "grad_norm": 2.177776575088501, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.6975737810134888, + "num_tokens": 116053761.0, + "step": 4575 + }, + { + "epoch": 0.5025258071601142, + "grad_norm": 2.4434597492218018, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7071899175643921, + "num_tokens": 116076817.0, + "step": 4576 + }, + { + "epoch": 0.5026356248627278, + "grad_norm": 
2.2609660625457764, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7024484276771545, + "num_tokens": 116100077.0, + "step": 4577 + }, + { + "epoch": 0.5027454425653415, + "grad_norm": 2.5499629974365234, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7284630537033081, + "num_tokens": 116120232.0, + "step": 4578 + }, + { + "epoch": 0.5028552602679552, + "grad_norm": 2.547144889831543, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7237924337387085, + "num_tokens": 116142222.0, + "step": 4579 + }, + { + "epoch": 0.5029650779705689, + "grad_norm": 2.3079519271850586, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.6977007389068604, + "num_tokens": 116165311.0, + "step": 4580 + }, + { + "epoch": 0.5030748956731825, + "grad_norm": 2.0856242179870605, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7264230847358704, + "num_tokens": 116190443.0, + "step": 4581 + }, + { + "epoch": 0.5031847133757962, + "grad_norm": 2.275437593460083, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7108607888221741, + "num_tokens": 116214191.0, + "step": 4582 + }, + { + "epoch": 0.5032945310784098, + "grad_norm": 2.3978099822998047, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.6998227834701538, + "num_tokens": 116237559.0, + "step": 4583 + }, + { + "epoch": 0.5034043487810235, + "grad_norm": 2.3894333839416504, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.7096994519233704, + "num_tokens": 116260026.0, + "step": 4584 + }, + { + "epoch": 0.5035141664836371, + "grad_norm": 1.8985852003097534, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7136324644088745, + "num_tokens": 116291720.0, + "step": 4585 + }, + { + "epoch": 0.5036239841862509, + "grad_norm": 1.9837313890457153, + "learning_rate": 1e-06, + "loss": 1.0833, + "mean_token_accuracy": 0.6792057752609253, + "num_tokens": 
116323331.0, + "step": 4586 + }, + { + "epoch": 0.5037338018888645, + "grad_norm": 1.9126564264297485, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.6959986686706543, + "num_tokens": 116354529.0, + "step": 4587 + }, + { + "epoch": 0.5038436195914782, + "grad_norm": 2.2225100994110107, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.7061963081359863, + "num_tokens": 116379726.0, + "step": 4588 + }, + { + "epoch": 0.5039534372940918, + "grad_norm": 2.192708730697632, + "learning_rate": 1e-06, + "loss": 1.0463, + "mean_token_accuracy": 0.6956313252449036, + "num_tokens": 116406103.0, + "step": 4589 + }, + { + "epoch": 0.5040632549967055, + "grad_norm": 1.8394672870635986, + "learning_rate": 1e-06, + "loss": 1.1414, + "mean_token_accuracy": 0.6728676557540894, + "num_tokens": 116443922.0, + "step": 4590 + }, + { + "epoch": 0.5041730726993191, + "grad_norm": 2.022608518600464, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.7027944922447205, + "num_tokens": 116474462.0, + "step": 4591 + }, + { + "epoch": 0.5042828904019327, + "grad_norm": 2.2100446224212646, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7141882181167603, + "num_tokens": 116499860.0, + "step": 4592 + }, + { + "epoch": 0.5043927081045465, + "grad_norm": 2.131119728088379, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.701579213142395, + "num_tokens": 116527762.0, + "step": 4593 + }, + { + "epoch": 0.5045025258071602, + "grad_norm": 2.4852371215820312, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7120882272720337, + "num_tokens": 116550467.0, + "step": 4594 + }, + { + "epoch": 0.5046123435097738, + "grad_norm": 2.2353360652923584, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.6931418180465698, + "num_tokens": 116577466.0, + "step": 4595 + }, + { + "epoch": 0.5047221612123874, + "grad_norm": 2.154627561569214, + "learning_rate": 1e-06, + 
"loss": 0.9062, + "mean_token_accuracy": 0.7291876077651978, + "num_tokens": 116603792.0, + "step": 4596 + }, + { + "epoch": 0.5048319789150011, + "grad_norm": 2.2157022953033447, + "learning_rate": 1e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.7035672664642334, + "num_tokens": 116629391.0, + "step": 4597 + }, + { + "epoch": 0.5049417966176147, + "grad_norm": 2.282641887664795, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7021408677101135, + "num_tokens": 116652241.0, + "step": 4598 + }, + { + "epoch": 0.5050516143202284, + "grad_norm": 2.1658570766448975, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.6984649896621704, + "num_tokens": 116678206.0, + "step": 4599 + }, + { + "epoch": 0.505161432022842, + "grad_norm": 2.295001983642578, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.6917778253555298, + "num_tokens": 116703287.0, + "step": 4600 + }, + { + "epoch": 0.5052712497254558, + "grad_norm": 2.449814796447754, + "learning_rate": 1e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.692671537399292, + "num_tokens": 116725428.0, + "step": 4601 + }, + { + "epoch": 0.5053810674280694, + "grad_norm": 2.123750686645508, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.6925591230392456, + "num_tokens": 116751606.0, + "step": 4602 + }, + { + "epoch": 0.5054908851306831, + "grad_norm": 2.1697866916656494, + "learning_rate": 1e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.690862238407135, + "num_tokens": 116778012.0, + "step": 4603 + }, + { + "epoch": 0.5056007028332967, + "grad_norm": 2.3052618503570557, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7158200144767761, + "num_tokens": 116801182.0, + "step": 4604 + }, + { + "epoch": 0.5057105205359104, + "grad_norm": 2.318638563156128, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7222374677658081, + "num_tokens": 116825016.0, + "step": 4605 + }, + { + "epoch": 
0.505820338238524, + "grad_norm": 2.2594377994537354, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.6894450187683105, + "num_tokens": 116852317.0, + "step": 4606 + }, + { + "epoch": 0.5059301559411377, + "grad_norm": 2.1498148441314697, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7037726044654846, + "num_tokens": 116878205.0, + "step": 4607 + }, + { + "epoch": 0.5060399736437514, + "grad_norm": 2.098431348800659, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6954530477523804, + "num_tokens": 116904680.0, + "step": 4608 + }, + { + "epoch": 0.5061497913463651, + "grad_norm": 2.273350238800049, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.6982320547103882, + "num_tokens": 116930076.0, + "step": 4609 + }, + { + "epoch": 0.5062596090489787, + "grad_norm": 1.987707257270813, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.6986083984375, + "num_tokens": 116961636.0, + "step": 4610 + }, + { + "epoch": 0.5063694267515924, + "grad_norm": 2.1877286434173584, + "learning_rate": 1e-06, + "loss": 1.112, + "mean_token_accuracy": 0.6683827638626099, + "num_tokens": 116990137.0, + "step": 4611 + }, + { + "epoch": 0.506479244454206, + "grad_norm": 1.9281731843948364, + "learning_rate": 1e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.693531334400177, + "num_tokens": 117022649.0, + "step": 4612 + }, + { + "epoch": 0.5065890621568196, + "grad_norm": 2.032912254333496, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7240605354309082, + "num_tokens": 117050215.0, + "step": 4613 + }, + { + "epoch": 0.5066988798594333, + "grad_norm": 2.211066246032715, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7103114724159241, + "num_tokens": 117075683.0, + "step": 4614 + }, + { + "epoch": 0.506808697562047, + "grad_norm": 2.1882224082946777, + "learning_rate": 1e-06, + "loss": 1.0396, + "mean_token_accuracy": 
0.6851015090942383, + "num_tokens": 117102685.0, + "step": 4615 + }, + { + "epoch": 0.5069185152646607, + "grad_norm": 2.2247402667999268, + "learning_rate": 1e-06, + "loss": 1.0645, + "mean_token_accuracy": 0.6842947006225586, + "num_tokens": 117128899.0, + "step": 4616 + }, + { + "epoch": 0.5070283329672743, + "grad_norm": 2.0294344425201416, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.6922992467880249, + "num_tokens": 117157203.0, + "step": 4617 + }, + { + "epoch": 0.507138150669888, + "grad_norm": 2.1126492023468018, + "learning_rate": 1e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.6886624097824097, + "num_tokens": 117184705.0, + "step": 4618 + }, + { + "epoch": 0.5072479683725016, + "grad_norm": 2.2397656440734863, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7136190533638, + "num_tokens": 117210372.0, + "step": 4619 + }, + { + "epoch": 0.5073577860751153, + "grad_norm": 2.322364091873169, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7262308597564697, + "num_tokens": 117231318.0, + "step": 4620 + }, + { + "epoch": 0.5074676037777289, + "grad_norm": 2.260084629058838, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7036954760551453, + "num_tokens": 117255185.0, + "step": 4621 + }, + { + "epoch": 0.5075774214803427, + "grad_norm": 2.441910743713379, + "learning_rate": 1e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.6900152564048767, + "num_tokens": 117278085.0, + "step": 4622 + }, + { + "epoch": 0.5076872391829563, + "grad_norm": 2.1124703884124756, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7077305316925049, + "num_tokens": 117304868.0, + "step": 4623 + }, + { + "epoch": 0.50779705688557, + "grad_norm": 2.1960134506225586, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.706629753112793, + "num_tokens": 117329765.0, + "step": 4624 + }, + { + "epoch": 0.5079068745881836, + "grad_norm": 2.0595180988311768, 
+ "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.704201877117157, + "num_tokens": 117358900.0, + "step": 4625 + }, + { + "epoch": 0.5080166922907973, + "grad_norm": 2.391444444656372, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.6968618035316467, + "num_tokens": 117382080.0, + "step": 4626 + }, + { + "epoch": 0.5081265099934109, + "grad_norm": 2.1441168785095215, + "learning_rate": 1e-06, + "loss": 1.0489, + "mean_token_accuracy": 0.6862853169441223, + "num_tokens": 117409455.0, + "step": 4627 + }, + { + "epoch": 0.5082363276960246, + "grad_norm": 2.0063669681549072, + "learning_rate": 1e-06, + "loss": 1.0667, + "mean_token_accuracy": 0.6800709962844849, + "num_tokens": 117439082.0, + "step": 4628 + }, + { + "epoch": 0.5083461453986382, + "grad_norm": 1.9542597532272339, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7087933421134949, + "num_tokens": 117469384.0, + "step": 4629 + }, + { + "epoch": 0.508455963101252, + "grad_norm": 2.1390743255615234, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7048516273498535, + "num_tokens": 117496062.0, + "step": 4630 + }, + { + "epoch": 0.5085657808038656, + "grad_norm": 2.1475751399993896, + "learning_rate": 1e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6878560185432434, + "num_tokens": 117524299.0, + "step": 4631 + }, + { + "epoch": 0.5086755985064793, + "grad_norm": 2.2688324451446533, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7042847871780396, + "num_tokens": 117549501.0, + "step": 4632 + }, + { + "epoch": 0.5087854162090929, + "grad_norm": 2.322636842727661, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.7127895951271057, + "num_tokens": 117572801.0, + "step": 4633 + }, + { + "epoch": 0.5088952339117065, + "grad_norm": 2.377013921737671, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7035327553749084, + "num_tokens": 117596092.0, + "step": 4634 
+ }, + { + "epoch": 0.5090050516143202, + "grad_norm": 2.235342264175415, + "learning_rate": 1e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.7311177849769592, + "num_tokens": 117619165.0, + "step": 4635 + }, + { + "epoch": 0.5091148693169338, + "grad_norm": 2.370309352874756, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.7078039646148682, + "num_tokens": 117641713.0, + "step": 4636 + }, + { + "epoch": 0.5092246870195476, + "grad_norm": 2.226078510284424, + "learning_rate": 1e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.6867737770080566, + "num_tokens": 117667840.0, + "step": 4637 + }, + { + "epoch": 0.5093345047221612, + "grad_norm": 2.0667786598205566, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7142089009284973, + "num_tokens": 117693607.0, + "step": 4638 + }, + { + "epoch": 0.5094443224247749, + "grad_norm": 1.956317663192749, + "learning_rate": 1e-06, + "loss": 1.027, + "mean_token_accuracy": 0.6963406801223755, + "num_tokens": 117726597.0, + "step": 4639 + }, + { + "epoch": 0.5095541401273885, + "grad_norm": 2.280622959136963, + "learning_rate": 1e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.6891628503799438, + "num_tokens": 117751695.0, + "step": 4640 + }, + { + "epoch": 0.5096639578300022, + "grad_norm": 2.276268720626831, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7076930999755859, + "num_tokens": 117776191.0, + "step": 4641 + }, + { + "epoch": 0.5097737755326158, + "grad_norm": 2.150156021118164, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.701278805732727, + "num_tokens": 117802936.0, + "step": 4642 + }, + { + "epoch": 0.5098835932352295, + "grad_norm": 1.9239282608032227, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7165766954421997, + "num_tokens": 117834045.0, + "step": 4643 + }, + { + "epoch": 0.5099934109378432, + "grad_norm": 2.4227261543273926, + "learning_rate": 1e-06, + "loss": 0.9427, + 
"mean_token_accuracy": 0.713189959526062, + "num_tokens": 117855141.0, + "step": 4644 + }, + { + "epoch": 0.5101032286404569, + "grad_norm": 1.9589275121688843, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7028593420982361, + "num_tokens": 117886572.0, + "step": 4645 + }, + { + "epoch": 0.5102130463430705, + "grad_norm": 2.527961015701294, + "learning_rate": 1e-06, + "loss": 1.0515, + "mean_token_accuracy": 0.6851614713668823, + "num_tokens": 117908116.0, + "step": 4646 + }, + { + "epoch": 0.5103228640456842, + "grad_norm": 2.6129114627838135, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.6987728476524353, + "num_tokens": 117928898.0, + "step": 4647 + }, + { + "epoch": 0.5104326817482978, + "grad_norm": 2.005796194076538, + "learning_rate": 1e-06, + "loss": 0.987, + "mean_token_accuracy": 0.7004961371421814, + "num_tokens": 117957406.0, + "step": 4648 + }, + { + "epoch": 0.5105424994509115, + "grad_norm": 2.217785358428955, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.6897585391998291, + "num_tokens": 117982952.0, + "step": 4649 + }, + { + "epoch": 0.5106523171535251, + "grad_norm": 2.073880910873413, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.7034987807273865, + "num_tokens": 118009899.0, + "step": 4650 + }, + { + "epoch": 0.5107621348561389, + "grad_norm": 2.28596830368042, + "learning_rate": 1e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.731277346611023, + "num_tokens": 118031862.0, + "step": 4651 + }, + { + "epoch": 0.5108719525587525, + "grad_norm": 2.43619441986084, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7004317045211792, + "num_tokens": 118053982.0, + "step": 4652 + }, + { + "epoch": 0.5109817702613662, + "grad_norm": 2.2441091537475586, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.7041388154029846, + "num_tokens": 118078760.0, + "step": 4653 + }, + { + "epoch": 0.5110915879639798, + 
"grad_norm": 2.104966640472412, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7144045829772949, + "num_tokens": 118104611.0, + "step": 4654 + }, + { + "epoch": 0.5112014056665934, + "grad_norm": 2.094437599182129, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7226865291595459, + "num_tokens": 118130092.0, + "step": 4655 + }, + { + "epoch": 0.5113112233692071, + "grad_norm": 1.8694995641708374, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7271658778190613, + "num_tokens": 118159753.0, + "step": 4656 + }, + { + "epoch": 0.5114210410718207, + "grad_norm": 1.896031379699707, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.699414074420929, + "num_tokens": 118192355.0, + "step": 4657 + }, + { + "epoch": 0.5115308587744345, + "grad_norm": 2.163163661956787, + "learning_rate": 1e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6846868991851807, + "num_tokens": 118219707.0, + "step": 4658 + }, + { + "epoch": 0.5116406764770481, + "grad_norm": 2.6164329051971436, + "learning_rate": 1e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7036136388778687, + "num_tokens": 118239118.0, + "step": 4659 + }, + { + "epoch": 0.5117504941796618, + "grad_norm": 2.075939416885376, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.6916844844818115, + "num_tokens": 118269557.0, + "step": 4660 + }, + { + "epoch": 0.5118603118822754, + "grad_norm": 1.976689100265503, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.70997154712677, + "num_tokens": 118298696.0, + "step": 4661 + }, + { + "epoch": 0.5119701295848891, + "grad_norm": 2.1058616638183594, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.693454384803772, + "num_tokens": 118326904.0, + "step": 4662 + }, + { + "epoch": 0.5120799472875027, + "grad_norm": 1.991960883140564, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.6984277963638306, + "num_tokens": 
118357810.0, + "step": 4663 + }, + { + "epoch": 0.5121897649901164, + "grad_norm": 2.3133299350738525, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.715680718421936, + "num_tokens": 118380265.0, + "step": 4664 + }, + { + "epoch": 0.51229958269273, + "grad_norm": 2.3971645832061768, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7070016860961914, + "num_tokens": 118402393.0, + "step": 4665 + }, + { + "epoch": 0.5124094003953438, + "grad_norm": 1.9814456701278687, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7015464305877686, + "num_tokens": 118433430.0, + "step": 4666 + }, + { + "epoch": 0.5125192180979574, + "grad_norm": 2.369018077850342, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.7094311714172363, + "num_tokens": 118457438.0, + "step": 4667 + }, + { + "epoch": 0.5126290358005711, + "grad_norm": 2.0771007537841797, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.714859664440155, + "num_tokens": 118483700.0, + "step": 4668 + }, + { + "epoch": 0.5127388535031847, + "grad_norm": 2.5491933822631836, + "learning_rate": 1e-06, + "loss": 0.8421, + "mean_token_accuracy": 0.7346792221069336, + "num_tokens": 118501545.0, + "step": 4669 + }, + { + "epoch": 0.5128486712057984, + "grad_norm": 2.179457664489746, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7092467546463013, + "num_tokens": 118527997.0, + "step": 4670 + }, + { + "epoch": 0.512958488908412, + "grad_norm": 2.3189454078674316, + "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.7025967240333557, + "num_tokens": 118551019.0, + "step": 4671 + }, + { + "epoch": 0.5130683066110256, + "grad_norm": 2.6447579860687256, + "learning_rate": 1e-06, + "loss": 0.987, + "mean_token_accuracy": 0.6980792880058289, + "num_tokens": 118572259.0, + "step": 4672 + }, + { + "epoch": 0.5131781243136394, + "grad_norm": 2.3515264987945557, + "learning_rate": 1e-06, + 
"loss": 0.9771, + "mean_token_accuracy": 0.7074167132377625, + "num_tokens": 118595491.0, + "step": 4673 + }, + { + "epoch": 0.513287942016253, + "grad_norm": 2.1407721042633057, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.6937359571456909, + "num_tokens": 118623156.0, + "step": 4674 + }, + { + "epoch": 0.5133977597188667, + "grad_norm": 2.471374750137329, + "learning_rate": 1e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7168651819229126, + "num_tokens": 118643848.0, + "step": 4675 + }, + { + "epoch": 0.5135075774214803, + "grad_norm": 2.395481586456299, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7150530815124512, + "num_tokens": 118665981.0, + "step": 4676 + }, + { + "epoch": 0.513617395124094, + "grad_norm": 1.9685275554656982, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7147387266159058, + "num_tokens": 118695225.0, + "step": 4677 + }, + { + "epoch": 0.5137272128267076, + "grad_norm": 2.2905678749084473, + "learning_rate": 1e-06, + "loss": 0.8691, + "mean_token_accuracy": 0.7298643589019775, + "num_tokens": 118718657.0, + "step": 4678 + }, + { + "epoch": 0.5138370305293213, + "grad_norm": 2.832937002182007, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7158077955245972, + "num_tokens": 118735184.0, + "step": 4679 + }, + { + "epoch": 0.513946848231935, + "grad_norm": 2.664785385131836, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7127066850662231, + "num_tokens": 118754507.0, + "step": 4680 + }, + { + "epoch": 0.5140566659345487, + "grad_norm": 2.50345516204834, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.722274661064148, + "num_tokens": 118773670.0, + "step": 4681 + }, + { + "epoch": 0.5141664836371623, + "grad_norm": 2.4566891193389893, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.709926962852478, + "num_tokens": 118794635.0, + "step": 4682 + }, + { + "epoch": 
0.514276301339776, + "grad_norm": 2.089214324951172, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.706937313079834, + "num_tokens": 118821949.0, + "step": 4683 + }, + { + "epoch": 0.5143861190423896, + "grad_norm": 2.15999436378479, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7163714170455933, + "num_tokens": 118848118.0, + "step": 4684 + }, + { + "epoch": 0.5144959367450033, + "grad_norm": 2.13427996635437, + "learning_rate": 1e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.7292534112930298, + "num_tokens": 118873865.0, + "step": 4685 + }, + { + "epoch": 0.5146057544476169, + "grad_norm": 2.0329508781433105, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7076348066329956, + "num_tokens": 118902489.0, + "step": 4686 + }, + { + "epoch": 0.5147155721502307, + "grad_norm": 2.3885927200317383, + "learning_rate": 1e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.698230504989624, + "num_tokens": 118924274.0, + "step": 4687 + }, + { + "epoch": 0.5148253898528443, + "grad_norm": 2.65818452835083, + "learning_rate": 1e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.6922702193260193, + "num_tokens": 118944372.0, + "step": 4688 + }, + { + "epoch": 0.514935207555458, + "grad_norm": 2.195467948913574, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.6928702592849731, + "num_tokens": 118969655.0, + "step": 4689 + }, + { + "epoch": 0.5150450252580716, + "grad_norm": 2.2017416954040527, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7192264795303345, + "num_tokens": 118994570.0, + "step": 4690 + }, + { + "epoch": 0.5151548429606853, + "grad_norm": 2.083381414413452, + "learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7122505903244019, + "num_tokens": 119021950.0, + "step": 4691 + }, + { + "epoch": 0.5152646606632989, + "grad_norm": 1.9884597063064575, + "learning_rate": 1e-06, + "loss": 1.0617, + "mean_token_accuracy": 
0.6748366951942444, + "num_tokens": 119052042.0, + "step": 4692 + }, + { + "epoch": 0.5153744783659125, + "grad_norm": 2.420781373977661, + "learning_rate": 1e-06, + "loss": 1.06, + "mean_token_accuracy": 0.6818172335624695, + "num_tokens": 119075448.0, + "step": 4693 + }, + { + "epoch": 0.5154842960685262, + "grad_norm": 2.5472004413604736, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7215269804000854, + "num_tokens": 119094802.0, + "step": 4694 + }, + { + "epoch": 0.51559411377114, + "grad_norm": 2.3958489894866943, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.6959362030029297, + "num_tokens": 119118515.0, + "step": 4695 + }, + { + "epoch": 0.5157039314737536, + "grad_norm": 2.2953922748565674, + "learning_rate": 1e-06, + "loss": 0.8832, + "mean_token_accuracy": 0.7277096509933472, + "num_tokens": 119140642.0, + "step": 4696 + }, + { + "epoch": 0.5158137491763672, + "grad_norm": 2.2298309803009033, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7216564416885376, + "num_tokens": 119165451.0, + "step": 4697 + }, + { + "epoch": 0.5159235668789809, + "grad_norm": 2.2239458560943604, + "learning_rate": 1e-06, + "loss": 0.8644, + "mean_token_accuracy": 0.7356857657432556, + "num_tokens": 119189443.0, + "step": 4698 + }, + { + "epoch": 0.5160333845815945, + "grad_norm": 1.97904372215271, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.6904167532920837, + "num_tokens": 119222184.0, + "step": 4699 + }, + { + "epoch": 0.5161432022842082, + "grad_norm": 2.1130282878875732, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7215070724487305, + "num_tokens": 119248496.0, + "step": 4700 + }, + { + "epoch": 0.5162530199868218, + "grad_norm": 2.221832513809204, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7016369104385376, + "num_tokens": 119274227.0, + "step": 4701 + }, + { + "epoch": 0.5163628376894356, + "grad_norm": 
2.1264467239379883, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7045336961746216, + "num_tokens": 119301604.0, + "step": 4702 + }, + { + "epoch": 0.5164726553920492, + "grad_norm": 2.1832807064056396, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7212653160095215, + "num_tokens": 119327762.0, + "step": 4703 + }, + { + "epoch": 0.5165824730946629, + "grad_norm": 1.8914635181427002, + "learning_rate": 1e-06, + "loss": 1.0257, + "mean_token_accuracy": 0.6970112323760986, + "num_tokens": 119359830.0, + "step": 4704 + }, + { + "epoch": 0.5166922907972765, + "grad_norm": 2.2429988384246826, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7151767015457153, + "num_tokens": 119382874.0, + "step": 4705 + }, + { + "epoch": 0.5168021084998902, + "grad_norm": 2.125140428543091, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7092087864875793, + "num_tokens": 119409047.0, + "step": 4706 + }, + { + "epoch": 0.5169119262025038, + "grad_norm": 2.356374502182007, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7151554822921753, + "num_tokens": 119431482.0, + "step": 4707 + }, + { + "epoch": 0.5170217439051175, + "grad_norm": 2.6469242572784424, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.715381383895874, + "num_tokens": 119453039.0, + "step": 4708 + }, + { + "epoch": 0.5171315616077312, + "grad_norm": 2.3262767791748047, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7194498777389526, + "num_tokens": 119474730.0, + "step": 4709 + }, + { + "epoch": 0.5172413793103449, + "grad_norm": 2.446732997894287, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7265446186065674, + "num_tokens": 119494211.0, + "step": 4710 + }, + { + "epoch": 0.5173511970129585, + "grad_norm": 2.1107821464538574, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7231074571609497, + "num_tokens": 
119521244.0, + "step": 4711 + }, + { + "epoch": 0.5174610147155722, + "grad_norm": 2.489790201187134, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7035001516342163, + "num_tokens": 119542284.0, + "step": 4712 + }, + { + "epoch": 0.5175708324181858, + "grad_norm": 2.369157075881958, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.722732424736023, + "num_tokens": 119563402.0, + "step": 4713 + }, + { + "epoch": 0.5176806501207994, + "grad_norm": 2.175398588180542, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.6933355331420898, + "num_tokens": 119588510.0, + "step": 4714 + }, + { + "epoch": 0.5177904678234131, + "grad_norm": 2.2625584602355957, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.7033473253250122, + "num_tokens": 119612929.0, + "step": 4715 + }, + { + "epoch": 0.5179002855260268, + "grad_norm": 2.3641271591186523, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7020068168640137, + "num_tokens": 119636045.0, + "step": 4716 + }, + { + "epoch": 0.5180101032286405, + "grad_norm": 2.1003735065460205, + "learning_rate": 1e-06, + "loss": 1.0357, + "mean_token_accuracy": 0.689976692199707, + "num_tokens": 119664662.0, + "step": 4717 + }, + { + "epoch": 0.5181199209312541, + "grad_norm": 2.5683436393737793, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7078293561935425, + "num_tokens": 119685967.0, + "step": 4718 + }, + { + "epoch": 0.5182297386338678, + "grad_norm": 2.014410972595215, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.7034516334533691, + "num_tokens": 119716627.0, + "step": 4719 + }, + { + "epoch": 0.5183395563364814, + "grad_norm": 2.08475399017334, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7052314281463623, + "num_tokens": 119743368.0, + "step": 4720 + }, + { + "epoch": 0.5184493740390951, + "grad_norm": 1.826948642730713, + "learning_rate": 1e-06, + 
"loss": 0.9795, + "mean_token_accuracy": 0.705435037612915, + "num_tokens": 119774847.0, + "step": 4721 + }, + { + "epoch": 0.5185591917417087, + "grad_norm": 2.129063367843628, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7238861918449402, + "num_tokens": 119804022.0, + "step": 4722 + }, + { + "epoch": 0.5186690094443224, + "grad_norm": 2.1254336833953857, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.7007186412811279, + "num_tokens": 119830761.0, + "step": 4723 + }, + { + "epoch": 0.5187788271469361, + "grad_norm": 2.1864583492279053, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.6982171535491943, + "num_tokens": 119856012.0, + "step": 4724 + }, + { + "epoch": 0.5188886448495498, + "grad_norm": 2.044930934906006, + "learning_rate": 1e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.6913021206855774, + "num_tokens": 119885662.0, + "step": 4725 + }, + { + "epoch": 0.5189984625521634, + "grad_norm": 2.351041793823242, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7022565007209778, + "num_tokens": 119908264.0, + "step": 4726 + }, + { + "epoch": 0.5191082802547771, + "grad_norm": 2.4468789100646973, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7199925184249878, + "num_tokens": 119928859.0, + "step": 4727 + }, + { + "epoch": 0.5192180979573907, + "grad_norm": 2.0944912433624268, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.6963912844657898, + "num_tokens": 119957744.0, + "step": 4728 + }, + { + "epoch": 0.5193279156600044, + "grad_norm": 2.2360377311706543, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6960605978965759, + "num_tokens": 119982984.0, + "step": 4729 + }, + { + "epoch": 0.519437733362618, + "grad_norm": 2.395737409591675, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7187443971633911, + "num_tokens": 120004050.0, + "step": 4730 + }, + { + "epoch": 
0.5195475510652318, + "grad_norm": 2.2013182640075684, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7291958928108215, + "num_tokens": 120028453.0, + "step": 4731 + }, + { + "epoch": 0.5196573687678454, + "grad_norm": 2.17392897605896, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.714604377746582, + "num_tokens": 120055332.0, + "step": 4732 + }, + { + "epoch": 0.519767186470459, + "grad_norm": 2.069859266281128, + "learning_rate": 1e-06, + "loss": 1.0199, + "mean_token_accuracy": 0.6948201060295105, + "num_tokens": 120081567.0, + "step": 4733 + }, + { + "epoch": 0.5198770041730727, + "grad_norm": 2.1621174812316895, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7270190119743347, + "num_tokens": 120108009.0, + "step": 4734 + }, + { + "epoch": 0.5199868218756863, + "grad_norm": 2.314352512359619, + "learning_rate": 1e-06, + "loss": 1.1052, + "mean_token_accuracy": 0.6688640117645264, + "num_tokens": 120130998.0, + "step": 4735 + }, + { + "epoch": 0.5200966395783, + "grad_norm": 2.1319055557250977, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.6881448030471802, + "num_tokens": 120159738.0, + "step": 4736 + }, + { + "epoch": 0.5202064572809136, + "grad_norm": 2.2302212715148926, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7151637077331543, + "num_tokens": 120182830.0, + "step": 4737 + }, + { + "epoch": 0.5203162749835274, + "grad_norm": 2.1172399520874023, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7151530981063843, + "num_tokens": 120208899.0, + "step": 4738 + }, + { + "epoch": 0.520426092686141, + "grad_norm": 2.2549664974212646, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.6945725679397583, + "num_tokens": 120234295.0, + "step": 4739 + }, + { + "epoch": 0.5205359103887547, + "grad_norm": 2.312373638153076, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 
0.7025129795074463, + "num_tokens": 120258942.0, + "step": 4740 + }, + { + "epoch": 0.5206457280913683, + "grad_norm": 2.136225700378418, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.719096302986145, + "num_tokens": 120284380.0, + "step": 4741 + }, + { + "epoch": 0.520755545793982, + "grad_norm": 2.2947094440460205, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7124834060668945, + "num_tokens": 120306970.0, + "step": 4742 + }, + { + "epoch": 0.5208653634965956, + "grad_norm": 2.025479793548584, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7171590328216553, + "num_tokens": 120337524.0, + "step": 4743 + }, + { + "epoch": 0.5209751811992093, + "grad_norm": 2.3380966186523438, + "learning_rate": 1e-06, + "loss": 1.0895, + "mean_token_accuracy": 0.6766201257705688, + "num_tokens": 120362170.0, + "step": 4744 + }, + { + "epoch": 0.521084998901823, + "grad_norm": 2.1551878452301025, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.713058352470398, + "num_tokens": 120387604.0, + "step": 4745 + }, + { + "epoch": 0.5211948166044367, + "grad_norm": 2.2661056518554688, + "learning_rate": 1e-06, + "loss": 1.0694, + "mean_token_accuracy": 0.6874848008155823, + "num_tokens": 120412115.0, + "step": 4746 + }, + { + "epoch": 0.5213046343070503, + "grad_norm": 2.2753713130950928, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7184387445449829, + "num_tokens": 120434894.0, + "step": 4747 + }, + { + "epoch": 0.521414452009664, + "grad_norm": 2.3341526985168457, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7007244825363159, + "num_tokens": 120457582.0, + "step": 4748 + }, + { + "epoch": 0.5215242697122776, + "grad_norm": 1.8814456462860107, + "learning_rate": 1e-06, + "loss": 1.0313, + "mean_token_accuracy": 0.6907418966293335, + "num_tokens": 120492275.0, + "step": 4749 + }, + { + "epoch": 0.5216340874148913, + "grad_norm": 
2.4412155151367188, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.7060624361038208, + "num_tokens": 120512212.0, + "step": 4750 + }, + { + "epoch": 0.5217439051175049, + "grad_norm": 2.2739408016204834, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7132035493850708, + "num_tokens": 120535659.0, + "step": 4751 + }, + { + "epoch": 0.5218537228201185, + "grad_norm": 2.3040857315063477, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7187470197677612, + "num_tokens": 120559634.0, + "step": 4752 + }, + { + "epoch": 0.5219635405227323, + "grad_norm": 2.1178088188171387, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6894787549972534, + "num_tokens": 120587854.0, + "step": 4753 + }, + { + "epoch": 0.522073358225346, + "grad_norm": 2.0394175052642822, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.703295111656189, + "num_tokens": 120618072.0, + "step": 4754 + }, + { + "epoch": 0.5221831759279596, + "grad_norm": 2.041994571685791, + "learning_rate": 1e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.6894859671592712, + "num_tokens": 120650686.0, + "step": 4755 + }, + { + "epoch": 0.5222929936305732, + "grad_norm": 2.27097225189209, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7009507417678833, + "num_tokens": 120674537.0, + "step": 4756 + }, + { + "epoch": 0.5224028113331869, + "grad_norm": 2.0962846279144287, + "learning_rate": 1e-06, + "loss": 1.0845, + "mean_token_accuracy": 0.673280656337738, + "num_tokens": 120704508.0, + "step": 4757 + }, + { + "epoch": 0.5225126290358005, + "grad_norm": 2.054952621459961, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.6919387578964233, + "num_tokens": 120731830.0, + "step": 4758 + }, + { + "epoch": 0.5226224467384142, + "grad_norm": 1.944632887840271, + "learning_rate": 1e-06, + "loss": 1.0732, + "mean_token_accuracy": 0.6806846857070923, + "num_tokens": 
120766265.0, + "step": 4759 + }, + { + "epoch": 0.5227322644410279, + "grad_norm": 2.088759422302246, + "learning_rate": 1e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7307872772216797, + "num_tokens": 120791996.0, + "step": 4760 + }, + { + "epoch": 0.5228420821436416, + "grad_norm": 2.0268783569335938, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7110470533370972, + "num_tokens": 120820404.0, + "step": 4761 + }, + { + "epoch": 0.5229518998462552, + "grad_norm": 2.2919037342071533, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7194424867630005, + "num_tokens": 120844728.0, + "step": 4762 + }, + { + "epoch": 0.5230617175488689, + "grad_norm": 2.1920859813690186, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7309505343437195, + "num_tokens": 120867824.0, + "step": 4763 + }, + { + "epoch": 0.5231715352514825, + "grad_norm": 2.3717689514160156, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7183619141578674, + "num_tokens": 120890480.0, + "step": 4764 + }, + { + "epoch": 0.5232813529540962, + "grad_norm": 2.475839376449585, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7203324437141418, + "num_tokens": 120909788.0, + "step": 4765 + }, + { + "epoch": 0.5233911706567098, + "grad_norm": 2.201509952545166, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.6979838609695435, + "num_tokens": 120935970.0, + "step": 4766 + }, + { + "epoch": 0.5235009883593236, + "grad_norm": 2.1247689723968506, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7014985680580139, + "num_tokens": 120963463.0, + "step": 4767 + }, + { + "epoch": 0.5236108060619372, + "grad_norm": 2.570685863494873, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7038244009017944, + "num_tokens": 120982971.0, + "step": 4768 + }, + { + "epoch": 0.5237206237645509, + "grad_norm": 2.310443162918091, + "learning_rate": 1e-06, + 
"loss": 0.883, + "mean_token_accuracy": 0.722337007522583, + "num_tokens": 121006735.0, + "step": 4769 + }, + { + "epoch": 0.5238304414671645, + "grad_norm": 1.9759525060653687, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7110652923583984, + "num_tokens": 121036410.0, + "step": 4770 + }, + { + "epoch": 0.5239402591697782, + "grad_norm": 2.24123215675354, + "learning_rate": 1e-06, + "loss": 0.8511, + "mean_token_accuracy": 0.7354041337966919, + "num_tokens": 121059752.0, + "step": 4771 + }, + { + "epoch": 0.5240500768723918, + "grad_norm": 1.8901795148849487, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7262609004974365, + "num_tokens": 121090634.0, + "step": 4772 + }, + { + "epoch": 0.5241598945750054, + "grad_norm": 2.2108633518218994, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7202355861663818, + "num_tokens": 121112628.0, + "step": 4773 + }, + { + "epoch": 0.5242697122776192, + "grad_norm": 2.149602174758911, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7083362936973572, + "num_tokens": 121137771.0, + "step": 4774 + }, + { + "epoch": 0.5243795299802329, + "grad_norm": 2.3744282722473145, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.6970635652542114, + "num_tokens": 121164433.0, + "step": 4775 + }, + { + "epoch": 0.5244893476828465, + "grad_norm": 2.0339536666870117, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.6945748329162598, + "num_tokens": 121191634.0, + "step": 4776 + }, + { + "epoch": 0.5245991653854601, + "grad_norm": 2.333281993865967, + "learning_rate": 1e-06, + "loss": 0.8674, + "mean_token_accuracy": 0.7347402572631836, + "num_tokens": 121213159.0, + "step": 4777 + }, + { + "epoch": 0.5247089830880738, + "grad_norm": 2.195065975189209, + "learning_rate": 1e-06, + "loss": 0.8159, + "mean_token_accuracy": 0.7401711940765381, + "num_tokens": 121237202.0, + "step": 4778 + }, + { + "epoch": 
0.5248188007906874, + "grad_norm": 2.7422900199890137, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.6926937699317932, + "num_tokens": 121256828.0, + "step": 4779 + }, + { + "epoch": 0.5249286184933011, + "grad_norm": 2.1033899784088135, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7073793411254883, + "num_tokens": 121284409.0, + "step": 4780 + }, + { + "epoch": 0.5250384361959147, + "grad_norm": 2.1448183059692383, + "learning_rate": 1e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.693875789642334, + "num_tokens": 121312222.0, + "step": 4781 + }, + { + "epoch": 0.5251482538985285, + "grad_norm": 2.261993885040283, + "learning_rate": 1e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.7044646143913269, + "num_tokens": 121336765.0, + "step": 4782 + }, + { + "epoch": 0.5252580716011421, + "grad_norm": 2.177480936050415, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.6952831745147705, + "num_tokens": 121364488.0, + "step": 4783 + }, + { + "epoch": 0.5253678893037558, + "grad_norm": 2.0629680156707764, + "learning_rate": 1e-06, + "loss": 1.069, + "mean_token_accuracy": 0.6814247369766235, + "num_tokens": 121397015.0, + "step": 4784 + }, + { + "epoch": 0.5254777070063694, + "grad_norm": 2.4210596084594727, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7209941744804382, + "num_tokens": 121416742.0, + "step": 4785 + }, + { + "epoch": 0.5255875247089831, + "grad_norm": 2.3091373443603516, + "learning_rate": 1e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.6951191425323486, + "num_tokens": 121440408.0, + "step": 4786 + }, + { + "epoch": 0.5256973424115967, + "grad_norm": 2.285956859588623, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.6943415403366089, + "num_tokens": 121462903.0, + "step": 4787 + }, + { + "epoch": 0.5258071601142104, + "grad_norm": 2.3800480365753174, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 
0.7152043581008911, + "num_tokens": 121485313.0, + "step": 4788 + }, + { + "epoch": 0.5259169778168241, + "grad_norm": 2.3275928497314453, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.727898120880127, + "num_tokens": 121507675.0, + "step": 4789 + }, + { + "epoch": 0.5260267955194378, + "grad_norm": 2.3742711544036865, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7040262222290039, + "num_tokens": 121529434.0, + "step": 4790 + }, + { + "epoch": 0.5261366132220514, + "grad_norm": 2.318946599960327, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7054232954978943, + "num_tokens": 121554650.0, + "step": 4791 + }, + { + "epoch": 0.526246430924665, + "grad_norm": 2.3362815380096436, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6968288421630859, + "num_tokens": 121579475.0, + "step": 4792 + }, + { + "epoch": 0.5263562486272787, + "grad_norm": 2.3581175804138184, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7087253928184509, + "num_tokens": 121601868.0, + "step": 4793 + }, + { + "epoch": 0.5264660663298923, + "grad_norm": 2.1372005939483643, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7016390562057495, + "num_tokens": 121628069.0, + "step": 4794 + }, + { + "epoch": 0.526575884032506, + "grad_norm": 2.019836902618408, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.6988296508789062, + "num_tokens": 121655734.0, + "step": 4795 + }, + { + "epoch": 0.5266857017351197, + "grad_norm": 2.1501388549804688, + "learning_rate": 1e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.6796174049377441, + "num_tokens": 121681485.0, + "step": 4796 + }, + { + "epoch": 0.5267955194377334, + "grad_norm": 2.202960252761841, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7165284752845764, + "num_tokens": 121705095.0, + "step": 4797 + }, + { + "epoch": 0.526905337140347, + "grad_norm": 
2.2586748600006104, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7196038961410522, + "num_tokens": 121727646.0, + "step": 4798 + }, + { + "epoch": 0.5270151548429607, + "grad_norm": 2.234727144241333, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.698689877986908, + "num_tokens": 121754651.0, + "step": 4799 + }, + { + "epoch": 0.5271249725455743, + "grad_norm": 2.5959548950195312, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7111361026763916, + "num_tokens": 121773628.0, + "step": 4800 + }, + { + "epoch": 0.527234790248188, + "grad_norm": 2.1067798137664795, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.6927565336227417, + "num_tokens": 121798741.0, + "step": 4801 + }, + { + "epoch": 0.5273446079508016, + "grad_norm": 2.4061591625213623, + "learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.7018872499465942, + "num_tokens": 121821914.0, + "step": 4802 + }, + { + "epoch": 0.5274544256534154, + "grad_norm": 2.303041934967041, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.695855975151062, + "num_tokens": 121845556.0, + "step": 4803 + }, + { + "epoch": 0.527564243356029, + "grad_norm": 2.1148152351379395, + "learning_rate": 1e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.6798562407493591, + "num_tokens": 121871795.0, + "step": 4804 + }, + { + "epoch": 0.5276740610586427, + "grad_norm": 2.147179365158081, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7168444395065308, + "num_tokens": 121897602.0, + "step": 4805 + }, + { + "epoch": 0.5277838787612563, + "grad_norm": 2.35764741897583, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7180768847465515, + "num_tokens": 121921678.0, + "step": 4806 + }, + { + "epoch": 0.52789369646387, + "grad_norm": 2.158135414123535, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7119752168655396, + "num_tokens": 121952670.0, + 
"step": 4807 + }, + { + "epoch": 0.5280035141664836, + "grad_norm": 2.386173963546753, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.6994285583496094, + "num_tokens": 121974638.0, + "step": 4808 + }, + { + "epoch": 0.5281133318690973, + "grad_norm": 2.1262125968933105, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.6995185613632202, + "num_tokens": 122000780.0, + "step": 4809 + }, + { + "epoch": 0.528223149571711, + "grad_norm": 2.1084907054901123, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7089118957519531, + "num_tokens": 122029141.0, + "step": 4810 + }, + { + "epoch": 0.5283329672743247, + "grad_norm": 2.177790641784668, + "learning_rate": 1e-06, + "loss": 1.045, + "mean_token_accuracy": 0.69389808177948, + "num_tokens": 122056170.0, + "step": 4811 + }, + { + "epoch": 0.5284427849769383, + "grad_norm": 2.2917699813842773, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7142225503921509, + "num_tokens": 122081315.0, + "step": 4812 + }, + { + "epoch": 0.528552602679552, + "grad_norm": 2.2053351402282715, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7011438012123108, + "num_tokens": 122108509.0, + "step": 4813 + }, + { + "epoch": 0.5286624203821656, + "grad_norm": 2.487576961517334, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7082887887954712, + "num_tokens": 122129618.0, + "step": 4814 + }, + { + "epoch": 0.5287722380847792, + "grad_norm": 2.0992629528045654, + "learning_rate": 1e-06, + "loss": 1.079, + "mean_token_accuracy": 0.6830145716667175, + "num_tokens": 122159537.0, + "step": 4815 + }, + { + "epoch": 0.5288820557873929, + "grad_norm": 2.186195135116577, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6941869258880615, + "num_tokens": 122185997.0, + "step": 4816 + }, + { + "epoch": 0.5289918734900065, + "grad_norm": 2.3025121688842773, + "learning_rate": 1e-06, + "loss": 0.9299, + 
"mean_token_accuracy": 0.705703616142273, + "num_tokens": 122208387.0, + "step": 4817 + }, + { + "epoch": 0.5291016911926203, + "grad_norm": 1.9165737628936768, + "learning_rate": 1e-06, + "loss": 1.0729, + "mean_token_accuracy": 0.6770150065422058, + "num_tokens": 122240880.0, + "step": 4818 + }, + { + "epoch": 0.5292115088952339, + "grad_norm": 2.2428462505340576, + "learning_rate": 1e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.6951144337654114, + "num_tokens": 122268434.0, + "step": 4819 + }, + { + "epoch": 0.5293213265978476, + "grad_norm": 2.282097339630127, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7031291723251343, + "num_tokens": 122294126.0, + "step": 4820 + }, + { + "epoch": 0.5294311443004612, + "grad_norm": 2.1676063537597656, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.6930376291275024, + "num_tokens": 122321715.0, + "step": 4821 + }, + { + "epoch": 0.5295409620030749, + "grad_norm": 1.960985779762268, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.6977463960647583, + "num_tokens": 122351833.0, + "step": 4822 + }, + { + "epoch": 0.5296507797056885, + "grad_norm": 2.1034023761749268, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.7012697458267212, + "num_tokens": 122380287.0, + "step": 4823 + }, + { + "epoch": 0.5297605974083022, + "grad_norm": 2.1515862941741943, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7132571935653687, + "num_tokens": 122406775.0, + "step": 4824 + }, + { + "epoch": 0.5298704151109159, + "grad_norm": 2.226916790008545, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7038632035255432, + "num_tokens": 122433166.0, + "step": 4825 + }, + { + "epoch": 0.5299802328135296, + "grad_norm": 2.467437505722046, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7125622034072876, + "num_tokens": 122457137.0, + "step": 4826 + }, + { + "epoch": 0.5300900505161432, + 
"grad_norm": 2.39199161529541, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7165460586547852, + "num_tokens": 122480612.0, + "step": 4827 + }, + { + "epoch": 0.5301998682187569, + "grad_norm": 2.1679155826568604, + "learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.7012957334518433, + "num_tokens": 122505649.0, + "step": 4828 + }, + { + "epoch": 0.5303096859213705, + "grad_norm": 2.303860664367676, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7190892696380615, + "num_tokens": 122527588.0, + "step": 4829 + }, + { + "epoch": 0.5304195036239842, + "grad_norm": 2.053995370864868, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.699860692024231, + "num_tokens": 122558208.0, + "step": 4830 + }, + { + "epoch": 0.5305293213265978, + "grad_norm": 2.0771360397338867, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7243694067001343, + "num_tokens": 122585159.0, + "step": 4831 + }, + { + "epoch": 0.5306391390292116, + "grad_norm": 2.1847217082977295, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.6963456869125366, + "num_tokens": 122610042.0, + "step": 4832 + }, + { + "epoch": 0.5307489567318252, + "grad_norm": 2.5908868312835693, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.6978606581687927, + "num_tokens": 122630666.0, + "step": 4833 + }, + { + "epoch": 0.5308587744344389, + "grad_norm": 2.27949857711792, + "learning_rate": 1e-06, + "loss": 0.8558, + "mean_token_accuracy": 0.7369856834411621, + "num_tokens": 122653387.0, + "step": 4834 + }, + { + "epoch": 0.5309685921370525, + "grad_norm": 2.3219821453094482, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.6972160339355469, + "num_tokens": 122679150.0, + "step": 4835 + }, + { + "epoch": 0.5310784098396661, + "grad_norm": 2.2929141521453857, + "learning_rate": 1e-06, + "loss": 1.0567, + "mean_token_accuracy": 0.6936641931533813, + 
"num_tokens": 122702703.0, + "step": 4836 + }, + { + "epoch": 0.5311882275422798, + "grad_norm": 2.226996660232544, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7184642553329468, + "num_tokens": 122727564.0, + "step": 4837 + }, + { + "epoch": 0.5312980452448934, + "grad_norm": 2.1571364402770996, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7135453224182129, + "num_tokens": 122753190.0, + "step": 4838 + }, + { + "epoch": 0.5314078629475072, + "grad_norm": 2.209623336791992, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.6918485164642334, + "num_tokens": 122779337.0, + "step": 4839 + }, + { + "epoch": 0.5315176806501208, + "grad_norm": 2.389003038406372, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.717440664768219, + "num_tokens": 122799486.0, + "step": 4840 + }, + { + "epoch": 0.5316274983527345, + "grad_norm": 1.9840924739837646, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.6822564005851746, + "num_tokens": 122829776.0, + "step": 4841 + }, + { + "epoch": 0.5317373160553481, + "grad_norm": 2.313293933868408, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7018007040023804, + "num_tokens": 122852784.0, + "step": 4842 + }, + { + "epoch": 0.5318471337579618, + "grad_norm": 2.1241159439086914, + "learning_rate": 1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7221709489822388, + "num_tokens": 122876709.0, + "step": 4843 + }, + { + "epoch": 0.5319569514605754, + "grad_norm": 2.4617621898651123, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.7097307443618774, + "num_tokens": 122896772.0, + "step": 4844 + }, + { + "epoch": 0.5320667691631891, + "grad_norm": 2.237865924835205, + "learning_rate": 1e-06, + "loss": 1.0521, + "mean_token_accuracy": 0.6800650358200073, + "num_tokens": 122922411.0, + "step": 4845 + }, + { + "epoch": 0.5321765868658027, + "grad_norm": 2.2982516288757324, + 
"learning_rate": 1e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.6934551000595093, + "num_tokens": 122945932.0, + "step": 4846 + }, + { + "epoch": 0.5322864045684165, + "grad_norm": 2.17722487449646, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7064974308013916, + "num_tokens": 122969070.0, + "step": 4847 + }, + { + "epoch": 0.5323962222710301, + "grad_norm": 1.9046401977539062, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.6994529962539673, + "num_tokens": 122999101.0, + "step": 4848 + }, + { + "epoch": 0.5325060399736438, + "grad_norm": 2.2784831523895264, + "learning_rate": 1e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.727048933506012, + "num_tokens": 123022671.0, + "step": 4849 + }, + { + "epoch": 0.5326158576762574, + "grad_norm": 2.171161651611328, + "learning_rate": 1e-06, + "loss": 1.0168, + "mean_token_accuracy": 0.6973202228546143, + "num_tokens": 123049602.0, + "step": 4850 + }, + { + "epoch": 0.532725675378871, + "grad_norm": 2.202066421508789, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.705906867980957, + "num_tokens": 123075957.0, + "step": 4851 + }, + { + "epoch": 0.5328354930814847, + "grad_norm": 2.2980854511260986, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7030631303787231, + "num_tokens": 123097798.0, + "step": 4852 + }, + { + "epoch": 0.5329453107840983, + "grad_norm": 2.1362435817718506, + "learning_rate": 1e-06, + "loss": 1.0362, + "mean_token_accuracy": 0.6813944578170776, + "num_tokens": 123126776.0, + "step": 4853 + }, + { + "epoch": 0.5330551284867121, + "grad_norm": 2.719921112060547, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7018826007843018, + "num_tokens": 123146084.0, + "step": 4854 + }, + { + "epoch": 0.5331649461893258, + "grad_norm": 2.109194278717041, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.6996159553527832, + "num_tokens": 123174739.0, + "step": 4855 + }, + 
{ + "epoch": 0.5332747638919394, + "grad_norm": 2.0399892330169678, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7061389684677124, + "num_tokens": 123201710.0, + "step": 4856 + }, + { + "epoch": 0.533384581594553, + "grad_norm": 2.122220039367676, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7108997106552124, + "num_tokens": 123227820.0, + "step": 4857 + }, + { + "epoch": 0.5334943992971667, + "grad_norm": 2.2628653049468994, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7202528119087219, + "num_tokens": 123252638.0, + "step": 4858 + }, + { + "epoch": 0.5336042169997803, + "grad_norm": 2.0181143283843994, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.702608585357666, + "num_tokens": 123280391.0, + "step": 4859 + }, + { + "epoch": 0.533714034702394, + "grad_norm": 2.0015869140625, + "learning_rate": 1e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.698278546333313, + "num_tokens": 123311120.0, + "step": 4860 + }, + { + "epoch": 0.5338238524050077, + "grad_norm": 2.0426464080810547, + "learning_rate": 1e-06, + "loss": 1.0108, + "mean_token_accuracy": 0.6967359781265259, + "num_tokens": 123340132.0, + "step": 4861 + }, + { + "epoch": 0.5339336701076214, + "grad_norm": 2.2145490646362305, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7050825953483582, + "num_tokens": 123365711.0, + "step": 4862 + }, + { + "epoch": 0.534043487810235, + "grad_norm": 1.8587223291397095, + "learning_rate": 1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.6972498297691345, + "num_tokens": 123399565.0, + "step": 4863 + }, + { + "epoch": 0.5341533055128487, + "grad_norm": 2.0268983840942383, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.6998770833015442, + "num_tokens": 123426493.0, + "step": 4864 + }, + { + "epoch": 0.5342631232154623, + "grad_norm": 2.1144893169403076, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 
0.6962535381317139, + "num_tokens": 123453956.0, + "step": 4865 + }, + { + "epoch": 0.534372940918076, + "grad_norm": 2.5511040687561035, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7071651220321655, + "num_tokens": 123474849.0, + "step": 4866 + }, + { + "epoch": 0.5344827586206896, + "grad_norm": 2.492842674255371, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7059282064437866, + "num_tokens": 123498440.0, + "step": 4867 + }, + { + "epoch": 0.5345925763233034, + "grad_norm": 2.4893040657043457, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7078176736831665, + "num_tokens": 123519803.0, + "step": 4868 + }, + { + "epoch": 0.534702394025917, + "grad_norm": 2.0985348224639893, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7213876247406006, + "num_tokens": 123545102.0, + "step": 4869 + }, + { + "epoch": 0.5348122117285307, + "grad_norm": 2.3051960468292236, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7036937475204468, + "num_tokens": 123569253.0, + "step": 4870 + }, + { + "epoch": 0.5349220294311443, + "grad_norm": 2.2404122352600098, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.6926596164703369, + "num_tokens": 123596001.0, + "step": 4871 + }, + { + "epoch": 0.535031847133758, + "grad_norm": 2.2008557319641113, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7207705974578857, + "num_tokens": 123618234.0, + "step": 4872 + }, + { + "epoch": 0.5351416648363716, + "grad_norm": 2.0870893001556396, + "learning_rate": 1e-06, + "loss": 1.105, + "mean_token_accuracy": 0.6762755513191223, + "num_tokens": 123649008.0, + "step": 4873 + }, + { + "epoch": 0.5352514825389852, + "grad_norm": 2.1488683223724365, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7069538235664368, + "num_tokens": 123674268.0, + "step": 4874 + }, + { + "epoch": 0.5353613002415989, + "grad_norm": 
2.6900179386138916, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7039626836776733, + "num_tokens": 123693711.0, + "step": 4875 + }, + { + "epoch": 0.5354711179442126, + "grad_norm": 2.7475457191467285, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7152703404426575, + "num_tokens": 123712492.0, + "step": 4876 + }, + { + "epoch": 0.5355809356468263, + "grad_norm": 2.336104154586792, + "learning_rate": 1e-06, + "loss": 1.0422, + "mean_token_accuracy": 0.6860989332199097, + "num_tokens": 123736109.0, + "step": 4877 + }, + { + "epoch": 0.5356907533494399, + "grad_norm": 2.095412254333496, + "learning_rate": 1e-06, + "loss": 1.0898, + "mean_token_accuracy": 0.6739784479141235, + "num_tokens": 123768282.0, + "step": 4878 + }, + { + "epoch": 0.5358005710520536, + "grad_norm": 2.0479896068573, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.6956313252449036, + "num_tokens": 123796130.0, + "step": 4879 + }, + { + "epoch": 0.5359103887546672, + "grad_norm": 2.209709882736206, + "learning_rate": 1e-06, + "loss": 1.0792, + "mean_token_accuracy": 0.682321310043335, + "num_tokens": 123821552.0, + "step": 4880 + }, + { + "epoch": 0.5360202064572809, + "grad_norm": 2.0728201866149902, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7080339789390564, + "num_tokens": 123847735.0, + "step": 4881 + }, + { + "epoch": 0.5361300241598945, + "grad_norm": 2.390361785888672, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.7039961218833923, + "num_tokens": 123868820.0, + "step": 4882 + }, + { + "epoch": 0.5362398418625083, + "grad_norm": 2.328098773956299, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.727224588394165, + "num_tokens": 123891526.0, + "step": 4883 + }, + { + "epoch": 0.5363496595651219, + "grad_norm": 2.402369260787964, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7221084833145142, + "num_tokens": 123912733.0, 
+ "step": 4884 + }, + { + "epoch": 0.5364594772677356, + "grad_norm": 2.502150058746338, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7182510495185852, + "num_tokens": 123932848.0, + "step": 4885 + }, + { + "epoch": 0.5365692949703492, + "grad_norm": 2.168048143386841, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7080543041229248, + "num_tokens": 123958333.0, + "step": 4886 + }, + { + "epoch": 0.5366791126729629, + "grad_norm": 2.4138362407684326, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.7112665772438049, + "num_tokens": 123980723.0, + "step": 4887 + }, + { + "epoch": 0.5367889303755765, + "grad_norm": 2.1039087772369385, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7146525382995605, + "num_tokens": 124010521.0, + "step": 4888 + }, + { + "epoch": 0.5368987480781902, + "grad_norm": 2.353050708770752, + "learning_rate": 1e-06, + "loss": 1.0587, + "mean_token_accuracy": 0.6830713748931885, + "num_tokens": 124035996.0, + "step": 4889 + }, + { + "epoch": 0.5370085657808039, + "grad_norm": 1.9259310960769653, + "learning_rate": 1e-06, + "loss": 1.0246, + "mean_token_accuracy": 0.6892611384391785, + "num_tokens": 124068443.0, + "step": 4890 + }, + { + "epoch": 0.5371183834834176, + "grad_norm": 2.4370837211608887, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.7022675275802612, + "num_tokens": 124091692.0, + "step": 4891 + }, + { + "epoch": 0.5372282011860312, + "grad_norm": 2.3961265087127686, + "learning_rate": 1e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7189663648605347, + "num_tokens": 124115675.0, + "step": 4892 + }, + { + "epoch": 0.5373380188886449, + "grad_norm": 1.8953391313552856, + "learning_rate": 1e-06, + "loss": 1.039, + "mean_token_accuracy": 0.6865818500518799, + "num_tokens": 124148953.0, + "step": 4893 + }, + { + "epoch": 0.5374478365912585, + "grad_norm": 2.2592594623565674, + "learning_rate": 1e-06, + "loss": 
0.9452, + "mean_token_accuracy": 0.7127152681350708, + "num_tokens": 124172151.0, + "step": 4894 + }, + { + "epoch": 0.5375576542938721, + "grad_norm": 2.1512975692749023, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6908815503120422, + "num_tokens": 124199154.0, + "step": 4895 + }, + { + "epoch": 0.5376674719964858, + "grad_norm": 2.2881407737731934, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7194519639015198, + "num_tokens": 124221425.0, + "step": 4896 + }, + { + "epoch": 0.5377772896990995, + "grad_norm": 2.186002492904663, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.6991163492202759, + "num_tokens": 124250581.0, + "step": 4897 + }, + { + "epoch": 0.5378871074017132, + "grad_norm": 2.3514270782470703, + "learning_rate": 1e-06, + "loss": 1.0417, + "mean_token_accuracy": 0.6857683062553406, + "num_tokens": 124275067.0, + "step": 4898 + }, + { + "epoch": 0.5379969251043268, + "grad_norm": 2.5020508766174316, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7033715844154358, + "num_tokens": 124296888.0, + "step": 4899 + }, + { + "epoch": 0.5381067428069405, + "grad_norm": 2.3296942710876465, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.7062497138977051, + "num_tokens": 124320698.0, + "step": 4900 + }, + { + "epoch": 0.5382165605095541, + "grad_norm": 2.2757036685943604, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6927011013031006, + "num_tokens": 124347252.0, + "step": 4901 + }, + { + "epoch": 0.5383263782121678, + "grad_norm": 2.147040367126465, + "learning_rate": 1e-06, + "loss": 1.0133, + "mean_token_accuracy": 0.6912150382995605, + "num_tokens": 124372530.0, + "step": 4902 + }, + { + "epoch": 0.5384361959147814, + "grad_norm": 2.0303080081939697, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.6926059126853943, + "num_tokens": 124403347.0, + "step": 4903 + }, + { + "epoch": 
0.5385460136173951, + "grad_norm": 2.12782883644104, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7027357816696167, + "num_tokens": 124432452.0, + "step": 4904 + }, + { + "epoch": 0.5386558313200088, + "grad_norm": 2.2469425201416016, + "learning_rate": 1e-06, + "loss": 1.0556, + "mean_token_accuracy": 0.6872924566268921, + "num_tokens": 124458038.0, + "step": 4905 + }, + { + "epoch": 0.5387656490226225, + "grad_norm": 2.4316859245300293, + "learning_rate": 1e-06, + "loss": 1.0915, + "mean_token_accuracy": 0.6795216202735901, + "num_tokens": 124480797.0, + "step": 4906 + }, + { + "epoch": 0.5388754667252361, + "grad_norm": 2.405233383178711, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7284526824951172, + "num_tokens": 124501292.0, + "step": 4907 + }, + { + "epoch": 0.5389852844278498, + "grad_norm": 2.3342912197113037, + "learning_rate": 1e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7300379276275635, + "num_tokens": 124522194.0, + "step": 4908 + }, + { + "epoch": 0.5390951021304634, + "grad_norm": 1.952383279800415, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7120734453201294, + "num_tokens": 124551953.0, + "step": 4909 + }, + { + "epoch": 0.5392049198330771, + "grad_norm": 2.2021687030792236, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.719365119934082, + "num_tokens": 124578935.0, + "step": 4910 + }, + { + "epoch": 0.5393147375356907, + "grad_norm": 2.257499933242798, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7069354057312012, + "num_tokens": 124602266.0, + "step": 4911 + }, + { + "epoch": 0.5394245552383045, + "grad_norm": 2.458080291748047, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.6933398246765137, + "num_tokens": 124623789.0, + "step": 4912 + }, + { + "epoch": 0.5395343729409181, + "grad_norm": 2.204709053039551, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 
0.6980528235435486, + "num_tokens": 124649494.0, + "step": 4913 + }, + { + "epoch": 0.5396441906435318, + "grad_norm": 2.1660304069519043, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.7008531093597412, + "num_tokens": 124676041.0, + "step": 4914 + }, + { + "epoch": 0.5397540083461454, + "grad_norm": 2.089953899383545, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7060433030128479, + "num_tokens": 124701724.0, + "step": 4915 + }, + { + "epoch": 0.539863826048759, + "grad_norm": 2.1111998558044434, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7162084579467773, + "num_tokens": 124728246.0, + "step": 4916 + }, + { + "epoch": 0.5399736437513727, + "grad_norm": 2.144998788833618, + "learning_rate": 1e-06, + "loss": 1.0199, + "mean_token_accuracy": 0.690428614616394, + "num_tokens": 124754378.0, + "step": 4917 + }, + { + "epoch": 0.5400834614539863, + "grad_norm": 2.2859458923339844, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.711603045463562, + "num_tokens": 124776929.0, + "step": 4918 + }, + { + "epoch": 0.5401932791566001, + "grad_norm": 2.559540033340454, + "learning_rate": 1e-06, + "loss": 1.0494, + "mean_token_accuracy": 0.6760512590408325, + "num_tokens": 124797528.0, + "step": 4919 + }, + { + "epoch": 0.5403030968592137, + "grad_norm": 2.6974923610687256, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.698264479637146, + "num_tokens": 124816160.0, + "step": 4920 + }, + { + "epoch": 0.5404129145618274, + "grad_norm": 1.9962211847305298, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7032020092010498, + "num_tokens": 124846493.0, + "step": 4921 + }, + { + "epoch": 0.540522732264441, + "grad_norm": 1.9720548391342163, + "learning_rate": 1e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.6876435279846191, + "num_tokens": 124876293.0, + "step": 4922 + }, + { + "epoch": 0.5406325499670547, + "grad_norm": 2.3083646297454834, 
+ "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.6955611705780029, + "num_tokens": 124901308.0, + "step": 4923 + }, + { + "epoch": 0.5407423676696683, + "grad_norm": 2.579381227493286, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.6985205411911011, + "num_tokens": 124921238.0, + "step": 4924 + }, + { + "epoch": 0.540852185372282, + "grad_norm": 2.0780446529388428, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6849856376647949, + "num_tokens": 124951438.0, + "step": 4925 + }, + { + "epoch": 0.5409620030748957, + "grad_norm": 2.2608888149261475, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7217884063720703, + "num_tokens": 124974340.0, + "step": 4926 + }, + { + "epoch": 0.5410718207775094, + "grad_norm": 2.137977123260498, + "learning_rate": 1e-06, + "loss": 1.0764, + "mean_token_accuracy": 0.6821469068527222, + "num_tokens": 125000483.0, + "step": 4927 + }, + { + "epoch": 0.541181638480123, + "grad_norm": 2.155059337615967, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.722804069519043, + "num_tokens": 125026193.0, + "step": 4928 + }, + { + "epoch": 0.5412914561827367, + "grad_norm": 2.278311252593994, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.6943039894104004, + "num_tokens": 125050628.0, + "step": 4929 + }, + { + "epoch": 0.5414012738853503, + "grad_norm": 2.5977561473846436, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7107290029525757, + "num_tokens": 125069857.0, + "step": 4930 + }, + { + "epoch": 0.541511091587964, + "grad_norm": 2.3768362998962402, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7146114110946655, + "num_tokens": 125091052.0, + "step": 4931 + }, + { + "epoch": 0.5416209092905776, + "grad_norm": 2.3278307914733887, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.6985840797424316, + "num_tokens": 125114977.0, + "step": 4932 + 
}, + { + "epoch": 0.5417307269931912, + "grad_norm": 2.0774149894714355, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7052754163742065, + "num_tokens": 125141782.0, + "step": 4933 + }, + { + "epoch": 0.541840544695805, + "grad_norm": 2.146852731704712, + "learning_rate": 1e-06, + "loss": 1.0426, + "mean_token_accuracy": 0.689108669757843, + "num_tokens": 125169958.0, + "step": 4934 + }, + { + "epoch": 0.5419503623984187, + "grad_norm": 2.5131027698516846, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.713499903678894, + "num_tokens": 125189784.0, + "step": 4935 + }, + { + "epoch": 0.5420601801010323, + "grad_norm": 2.0296382904052734, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7277014851570129, + "num_tokens": 125217208.0, + "step": 4936 + }, + { + "epoch": 0.5421699978036459, + "grad_norm": 2.2547268867492676, + "learning_rate": 1e-06, + "loss": 1.0949, + "mean_token_accuracy": 0.6753376722335815, + "num_tokens": 125241324.0, + "step": 4937 + }, + { + "epoch": 0.5422798155062596, + "grad_norm": 2.1139514446258545, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.6961275339126587, + "num_tokens": 125270444.0, + "step": 4938 + }, + { + "epoch": 0.5423896332088732, + "grad_norm": 2.154693603515625, + "learning_rate": 1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.7050150036811829, + "num_tokens": 125297677.0, + "step": 4939 + }, + { + "epoch": 0.5424994509114869, + "grad_norm": 2.200692892074585, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7035472989082336, + "num_tokens": 125323027.0, + "step": 4940 + }, + { + "epoch": 0.5426092686141006, + "grad_norm": 2.3800084590911865, + "learning_rate": 1e-06, + "loss": 1.0551, + "mean_token_accuracy": 0.6825637221336365, + "num_tokens": 125347505.0, + "step": 4941 + }, + { + "epoch": 0.5427190863167143, + "grad_norm": 2.3716845512390137, + "learning_rate": 1e-06, + "loss": 0.8421, + 
"mean_token_accuracy": 0.7313521504402161, + "num_tokens": 125369355.0, + "step": 4942 + }, + { + "epoch": 0.5428289040193279, + "grad_norm": 2.0889315605163574, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7145307064056396, + "num_tokens": 125396303.0, + "step": 4943 + }, + { + "epoch": 0.5429387217219416, + "grad_norm": 2.2749521732330322, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7163202166557312, + "num_tokens": 125419623.0, + "step": 4944 + }, + { + "epoch": 0.5430485394245552, + "grad_norm": 2.227118968963623, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7169634103775024, + "num_tokens": 125444100.0, + "step": 4945 + }, + { + "epoch": 0.5431583571271689, + "grad_norm": 1.9775205850601196, + "learning_rate": 1e-06, + "loss": 1.0459, + "mean_token_accuracy": 0.6924999356269836, + "num_tokens": 125475319.0, + "step": 4946 + }, + { + "epoch": 0.5432681748297825, + "grad_norm": 2.3745064735412598, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7013270258903503, + "num_tokens": 125497750.0, + "step": 4947 + }, + { + "epoch": 0.5433779925323963, + "grad_norm": 2.2872843742370605, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7068209052085876, + "num_tokens": 125520738.0, + "step": 4948 + }, + { + "epoch": 0.5434878102350099, + "grad_norm": 2.19874906539917, + "learning_rate": 1e-06, + "loss": 1.0236, + "mean_token_accuracy": 0.6935029029846191, + "num_tokens": 125546621.0, + "step": 4949 + }, + { + "epoch": 0.5435976279376236, + "grad_norm": 2.0487568378448486, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7038605213165283, + "num_tokens": 125575439.0, + "step": 4950 + }, + { + "epoch": 0.5437074456402372, + "grad_norm": 2.344268798828125, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7009627819061279, + "num_tokens": 125599306.0, + "step": 4951 + }, + { + "epoch": 0.5438172633428509, + 
"grad_norm": 2.5399861335754395, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.702957272529602, + "num_tokens": 125619655.0, + "step": 4952 + }, + { + "epoch": 0.5439270810454645, + "grad_norm": 2.319544553756714, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7230811715126038, + "num_tokens": 125642934.0, + "step": 4953 + }, + { + "epoch": 0.5440368987480781, + "grad_norm": 2.0076661109924316, + "learning_rate": 1e-06, + "loss": 1.0684, + "mean_token_accuracy": 0.6752474904060364, + "num_tokens": 125674459.0, + "step": 4954 + }, + { + "epoch": 0.5441467164506919, + "grad_norm": 1.9891633987426758, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7038207054138184, + "num_tokens": 125703344.0, + "step": 4955 + }, + { + "epoch": 0.5442565341533055, + "grad_norm": 2.215200662612915, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.706939697265625, + "num_tokens": 125729479.0, + "step": 4956 + }, + { + "epoch": 0.5443663518559192, + "grad_norm": 2.405407428741455, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7225202322006226, + "num_tokens": 125750664.0, + "step": 4957 + }, + { + "epoch": 0.5444761695585328, + "grad_norm": 2.3093600273132324, + "learning_rate": 1e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.693023145198822, + "num_tokens": 125774091.0, + "step": 4958 + }, + { + "epoch": 0.5445859872611465, + "grad_norm": 2.494255542755127, + "learning_rate": 1e-06, + "loss": 0.877, + "mean_token_accuracy": 0.7263311147689819, + "num_tokens": 125794174.0, + "step": 4959 + }, + { + "epoch": 0.5446958049637601, + "grad_norm": 2.2404026985168457, + "learning_rate": 1e-06, + "loss": 1.0956, + "mean_token_accuracy": 0.6723653078079224, + "num_tokens": 125820923.0, + "step": 4960 + }, + { + "epoch": 0.5448056226663738, + "grad_norm": 2.226318120956421, + "learning_rate": 1e-06, + "loss": 1.0609, + "mean_token_accuracy": 0.6819069981575012, + "num_tokens": 
125846107.0, + "step": 4961 + }, + { + "epoch": 0.5449154403689875, + "grad_norm": 2.537321090698242, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7165930867195129, + "num_tokens": 125866584.0, + "step": 4962 + }, + { + "epoch": 0.5450252580716012, + "grad_norm": 2.365518808364868, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7123897671699524, + "num_tokens": 125889549.0, + "step": 4963 + }, + { + "epoch": 0.5451350757742148, + "grad_norm": 2.39748477935791, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.704676628112793, + "num_tokens": 125912664.0, + "step": 4964 + }, + { + "epoch": 0.5452448934768285, + "grad_norm": 2.585886240005493, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7183589935302734, + "num_tokens": 125933152.0, + "step": 4965 + }, + { + "epoch": 0.5453547111794421, + "grad_norm": 2.3855345249176025, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7156330943107605, + "num_tokens": 125954912.0, + "step": 4966 + }, + { + "epoch": 0.5454645288820558, + "grad_norm": 2.0554535388946533, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 0.705816388130188, + "num_tokens": 125986012.0, + "step": 4967 + }, + { + "epoch": 0.5455743465846694, + "grad_norm": 2.472139835357666, + "learning_rate": 1e-06, + "loss": 1.0712, + "mean_token_accuracy": 0.6788660287857056, + "num_tokens": 126007397.0, + "step": 4968 + }, + { + "epoch": 0.5456841642872831, + "grad_norm": 2.1070590019226074, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.7103706002235413, + "num_tokens": 126036291.0, + "step": 4969 + }, + { + "epoch": 0.5457939819898968, + "grad_norm": 2.199718475341797, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7123925685882568, + "num_tokens": 126061786.0, + "step": 4970 + }, + { + "epoch": 0.5459037996925105, + "grad_norm": 2.5921175479888916, + "learning_rate": 1e-06, + "loss": 
0.9028, + "mean_token_accuracy": 0.7208955883979797, + "num_tokens": 126081537.0, + "step": 4971 + }, + { + "epoch": 0.5460136173951241, + "grad_norm": 2.0432283878326416, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7054013013839722, + "num_tokens": 126112684.0, + "step": 4972 + }, + { + "epoch": 0.5461234350977378, + "grad_norm": 2.393002986907959, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7024046182632446, + "num_tokens": 126135305.0, + "step": 4973 + }, + { + "epoch": 0.5462332528003514, + "grad_norm": 2.1271448135375977, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.7015529274940491, + "num_tokens": 126161969.0, + "step": 4974 + }, + { + "epoch": 0.546343070502965, + "grad_norm": 2.3060882091522217, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7126616835594177, + "num_tokens": 126184326.0, + "step": 4975 + }, + { + "epoch": 0.5464528882055787, + "grad_norm": 2.1133885383605957, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7011703848838806, + "num_tokens": 126211984.0, + "step": 4976 + }, + { + "epoch": 0.5465627059081924, + "grad_norm": 2.2133147716522217, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.689613938331604, + "num_tokens": 126237964.0, + "step": 4977 + }, + { + "epoch": 0.5466725236108061, + "grad_norm": 2.084615468978882, + "learning_rate": 1e-06, + "loss": 1.0307, + "mean_token_accuracy": 0.6901158094406128, + "num_tokens": 126267032.0, + "step": 4978 + }, + { + "epoch": 0.5467823413134197, + "grad_norm": 2.103564500808716, + "learning_rate": 1e-06, + "loss": 1.0323, + "mean_token_accuracy": 0.6887034177780151, + "num_tokens": 126294237.0, + "step": 4979 + }, + { + "epoch": 0.5468921590160334, + "grad_norm": 2.0650341510772705, + "learning_rate": 1e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7168770432472229, + "num_tokens": 126318662.0, + "step": 4980 + }, + { + "epoch": 
0.547001976718647, + "grad_norm": 2.4902920722961426, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.694905698299408, + "num_tokens": 126339517.0, + "step": 4981 + }, + { + "epoch": 0.5471117944212607, + "grad_norm": 2.360293388366699, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7173391580581665, + "num_tokens": 126360919.0, + "step": 4982 + }, + { + "epoch": 0.5472216121238743, + "grad_norm": 2.6076300144195557, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.694690465927124, + "num_tokens": 126378842.0, + "step": 4983 + }, + { + "epoch": 0.5473314298264881, + "grad_norm": 2.0762202739715576, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.718362033367157, + "num_tokens": 126406630.0, + "step": 4984 + }, + { + "epoch": 0.5474412475291017, + "grad_norm": 2.580467462539673, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7105427980422974, + "num_tokens": 126427181.0, + "step": 4985 + }, + { + "epoch": 0.5475510652317154, + "grad_norm": 2.1100234985351562, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7246840596199036, + "num_tokens": 126453429.0, + "step": 4986 + }, + { + "epoch": 0.547660882934329, + "grad_norm": 2.3856945037841797, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7118151783943176, + "num_tokens": 126476458.0, + "step": 4987 + }, + { + "epoch": 0.5477707006369427, + "grad_norm": 2.1315548419952393, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.6926780939102173, + "num_tokens": 126503865.0, + "step": 4988 + }, + { + "epoch": 0.5478805183395563, + "grad_norm": 2.1176130771636963, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.715521514415741, + "num_tokens": 126530811.0, + "step": 4989 + }, + { + "epoch": 0.54799033604217, + "grad_norm": 2.2186782360076904, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 
0.719096839427948, + "num_tokens": 126556812.0, + "step": 4990 + }, + { + "epoch": 0.5481001537447837, + "grad_norm": 2.5287206172943115, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.7004321813583374, + "num_tokens": 126577548.0, + "step": 4991 + }, + { + "epoch": 0.5482099714473974, + "grad_norm": 2.3162131309509277, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.711934506893158, + "num_tokens": 126601303.0, + "step": 4992 + }, + { + "epoch": 0.548319789150011, + "grad_norm": 2.078551769256592, + "learning_rate": 1e-06, + "loss": 1.1011, + "mean_token_accuracy": 0.6711352467536926, + "num_tokens": 126630337.0, + "step": 4993 + }, + { + "epoch": 0.5484296068526247, + "grad_norm": 2.2099452018737793, + "learning_rate": 1e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7241930961608887, + "num_tokens": 126654094.0, + "step": 4994 + }, + { + "epoch": 0.5485394245552383, + "grad_norm": 1.8585911989212036, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7065476179122925, + "num_tokens": 126685802.0, + "step": 4995 + }, + { + "epoch": 0.5486492422578519, + "grad_norm": 2.100083589553833, + "learning_rate": 1e-06, + "loss": 1.1007, + "mean_token_accuracy": 0.6681552529335022, + "num_tokens": 126715459.0, + "step": 4996 + }, + { + "epoch": 0.5487590599604656, + "grad_norm": 2.265552043914795, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7126947641372681, + "num_tokens": 126740295.0, + "step": 4997 + }, + { + "epoch": 0.5488688776630792, + "grad_norm": 2.4105706214904785, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.720736026763916, + "num_tokens": 126763483.0, + "step": 4998 + }, + { + "epoch": 0.548978695365693, + "grad_norm": 2.0712573528289795, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.6924273371696472, + "num_tokens": 126790735.0, + "step": 4999 + }, + { + "epoch": 0.5490885130683066, + "grad_norm": 
2.614394187927246, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.6939029097557068, + "num_tokens": 126809692.0, + "step": 5000 + }, + { + "epoch": 0.5491983307709203, + "grad_norm": 2.075040578842163, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7125294208526611, + "num_tokens": 126837862.0, + "step": 5001 + }, + { + "epoch": 0.5493081484735339, + "grad_norm": 2.1419949531555176, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7077512741088867, + "num_tokens": 126864651.0, + "step": 5002 + }, + { + "epoch": 0.5494179661761476, + "grad_norm": 2.287188768386841, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7259058952331543, + "num_tokens": 126886327.0, + "step": 5003 + }, + { + "epoch": 0.5495277838787612, + "grad_norm": 2.0396690368652344, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7250064611434937, + "num_tokens": 126915467.0, + "step": 5004 + }, + { + "epoch": 0.5496376015813749, + "grad_norm": 2.187770366668701, + "learning_rate": 1e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.7318242192268372, + "num_tokens": 126939805.0, + "step": 5005 + }, + { + "epoch": 0.5497474192839886, + "grad_norm": 2.1702325344085693, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.718853771686554, + "num_tokens": 126963367.0, + "step": 5006 + }, + { + "epoch": 0.5498572369866023, + "grad_norm": 2.20491886138916, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7149837017059326, + "num_tokens": 126987926.0, + "step": 5007 + }, + { + "epoch": 0.5499670546892159, + "grad_norm": 2.393219470977783, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.6960537433624268, + "num_tokens": 127011000.0, + "step": 5008 + }, + { + "epoch": 0.5500768723918296, + "grad_norm": 2.2732667922973633, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7072724103927612, + "num_tokens": 127033816.0, 
+ "step": 5009 + }, + { + "epoch": 0.5501866900944432, + "grad_norm": 2.4429516792297363, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.708609402179718, + "num_tokens": 127053916.0, + "step": 5010 + }, + { + "epoch": 0.5502965077970569, + "grad_norm": 2.3230252265930176, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7251980304718018, + "num_tokens": 127074470.0, + "step": 5011 + }, + { + "epoch": 0.5504063254996705, + "grad_norm": 2.133397102355957, + "learning_rate": 1e-06, + "loss": 1.0639, + "mean_token_accuracy": 0.6894426941871643, + "num_tokens": 127103221.0, + "step": 5012 + }, + { + "epoch": 0.5505161432022843, + "grad_norm": 2.1582424640655518, + "learning_rate": 1e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.6906914114952087, + "num_tokens": 127127861.0, + "step": 5013 + }, + { + "epoch": 0.5506259609048979, + "grad_norm": 2.1288866996765137, + "learning_rate": 1e-06, + "loss": 0.8557, + "mean_token_accuracy": 0.7378748655319214, + "num_tokens": 127153502.0, + "step": 5014 + }, + { + "epoch": 0.5507357786075116, + "grad_norm": 2.2039592266082764, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7203667163848877, + "num_tokens": 127178693.0, + "step": 5015 + }, + { + "epoch": 0.5508455963101252, + "grad_norm": 2.1327733993530273, + "learning_rate": 1e-06, + "loss": 1.0569, + "mean_token_accuracy": 0.6844861507415771, + "num_tokens": 127205175.0, + "step": 5016 + }, + { + "epoch": 0.5509554140127388, + "grad_norm": 2.4094510078430176, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7145191431045532, + "num_tokens": 127226951.0, + "step": 5017 + }, + { + "epoch": 0.5510652317153525, + "grad_norm": 2.3788836002349854, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7127809524536133, + "num_tokens": 127249958.0, + "step": 5018 + }, + { + "epoch": 0.5511750494179661, + "grad_norm": 2.0935919284820557, + "learning_rate": 1e-06, + "loss": 
0.9502, + "mean_token_accuracy": 0.7045486569404602, + "num_tokens": 127276962.0, + "step": 5019 + }, + { + "epoch": 0.5512848671205799, + "grad_norm": 2.2624077796936035, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7232342958450317, + "num_tokens": 127302526.0, + "step": 5020 + }, + { + "epoch": 0.5513946848231935, + "grad_norm": 2.1109113693237305, + "learning_rate": 1e-06, + "loss": 1.1043, + "mean_token_accuracy": 0.6786386966705322, + "num_tokens": 127331891.0, + "step": 5021 + }, + { + "epoch": 0.5515045025258072, + "grad_norm": 2.3464725017547607, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7143566608428955, + "num_tokens": 127352850.0, + "step": 5022 + }, + { + "epoch": 0.5516143202284208, + "grad_norm": 2.518756151199341, + "learning_rate": 1e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.7451766729354858, + "num_tokens": 127372250.0, + "step": 5023 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 2.006223678588867, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.6933172345161438, + "num_tokens": 127402116.0, + "step": 5024 + }, + { + "epoch": 0.5518339556336481, + "grad_norm": 2.1593587398529053, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7013906240463257, + "num_tokens": 127428895.0, + "step": 5025 + }, + { + "epoch": 0.5519437733362618, + "grad_norm": 2.3849120140075684, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7132626175880432, + "num_tokens": 127451228.0, + "step": 5026 + }, + { + "epoch": 0.5520535910388754, + "grad_norm": 2.232391834259033, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.725431501865387, + "num_tokens": 127475600.0, + "step": 5027 + }, + { + "epoch": 0.5521634087414892, + "grad_norm": 2.3418800830841064, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7116160988807678, + "num_tokens": 127500311.0, + "step": 5028 + }, + { + "epoch": 
0.5522732264441028, + "grad_norm": 2.6189041137695312, + "learning_rate": 1e-06, + "loss": 1.0521, + "mean_token_accuracy": 0.6779597401618958, + "num_tokens": 127520493.0, + "step": 5029 + }, + { + "epoch": 0.5523830441467165, + "grad_norm": 2.393249750137329, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.7010807394981384, + "num_tokens": 127542158.0, + "step": 5030 + }, + { + "epoch": 0.5524928618493301, + "grad_norm": 2.421926736831665, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.6948489546775818, + "num_tokens": 127565491.0, + "step": 5031 + }, + { + "epoch": 0.5526026795519438, + "grad_norm": 1.9906845092773438, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.710765540599823, + "num_tokens": 127594707.0, + "step": 5032 + }, + { + "epoch": 0.5527124972545574, + "grad_norm": 2.19636869430542, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.6958156824111938, + "num_tokens": 127622835.0, + "step": 5033 + }, + { + "epoch": 0.552822314957171, + "grad_norm": 2.0410547256469727, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.7060741186141968, + "num_tokens": 127649620.0, + "step": 5034 + }, + { + "epoch": 0.5529321326597848, + "grad_norm": 2.4207820892333984, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7122280597686768, + "num_tokens": 127669624.0, + "step": 5035 + }, + { + "epoch": 0.5530419503623984, + "grad_norm": 2.091226100921631, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7090096473693848, + "num_tokens": 127698243.0, + "step": 5036 + }, + { + "epoch": 0.5531517680650121, + "grad_norm": 2.5646002292633057, + "learning_rate": 1e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7341231107711792, + "num_tokens": 127717046.0, + "step": 5037 + }, + { + "epoch": 0.5532615857676257, + "grad_norm": 2.085575819015503, + "learning_rate": 1e-06, + "loss": 1.0533, + "mean_token_accuracy": 
0.6868698596954346, + "num_tokens": 127745955.0, + "step": 5038 + }, + { + "epoch": 0.5533714034702394, + "grad_norm": 2.0740530490875244, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.6988070011138916, + "num_tokens": 127775434.0, + "step": 5039 + }, + { + "epoch": 0.553481221172853, + "grad_norm": 2.098923683166504, + "learning_rate": 1e-06, + "loss": 1.124, + "mean_token_accuracy": 0.6660431027412415, + "num_tokens": 127804738.0, + "step": 5040 + }, + { + "epoch": 0.5535910388754667, + "grad_norm": 2.0441946983337402, + "learning_rate": 1e-06, + "loss": 1.0346, + "mean_token_accuracy": 0.6922805905342102, + "num_tokens": 127835058.0, + "step": 5041 + }, + { + "epoch": 0.5537008565780804, + "grad_norm": 2.4220733642578125, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7175518274307251, + "num_tokens": 127856839.0, + "step": 5042 + }, + { + "epoch": 0.5538106742806941, + "grad_norm": 2.1785147190093994, + "learning_rate": 1e-06, + "loss": 1.0512, + "mean_token_accuracy": 0.6788031458854675, + "num_tokens": 127882506.0, + "step": 5043 + }, + { + "epoch": 0.5539204919833077, + "grad_norm": 2.1801252365112305, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7043715715408325, + "num_tokens": 127908321.0, + "step": 5044 + }, + { + "epoch": 0.5540303096859214, + "grad_norm": 1.9715033769607544, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7029794454574585, + "num_tokens": 127938649.0, + "step": 5045 + }, + { + "epoch": 0.554140127388535, + "grad_norm": 2.326744556427002, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7096059918403625, + "num_tokens": 127962186.0, + "step": 5046 + }, + { + "epoch": 0.5542499450911487, + "grad_norm": 2.2054238319396973, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7092640399932861, + "num_tokens": 127989945.0, + "step": 5047 + }, + { + "epoch": 0.5543597627937623, + "grad_norm": 
2.1452231407165527, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7146313190460205, + "num_tokens": 128016274.0, + "step": 5048 + }, + { + "epoch": 0.5544695804963761, + "grad_norm": 2.197803020477295, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7158528566360474, + "num_tokens": 128039892.0, + "step": 5049 + }, + { + "epoch": 0.5545793981989897, + "grad_norm": 2.476778507232666, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.6973875761032104, + "num_tokens": 128061605.0, + "step": 5050 + }, + { + "epoch": 0.5546892159016034, + "grad_norm": 2.664433717727661, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7114022970199585, + "num_tokens": 128079334.0, + "step": 5051 + }, + { + "epoch": 0.554799033604217, + "grad_norm": 1.9693487882614136, + "learning_rate": 1e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.7199956178665161, + "num_tokens": 128109152.0, + "step": 5052 + }, + { + "epoch": 0.5549088513068307, + "grad_norm": 2.1581642627716064, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6987049579620361, + "num_tokens": 128135841.0, + "step": 5053 + }, + { + "epoch": 0.5550186690094443, + "grad_norm": 2.1409926414489746, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7107702493667603, + "num_tokens": 128161848.0, + "step": 5054 + }, + { + "epoch": 0.555128486712058, + "grad_norm": 2.3476860523223877, + "learning_rate": 1e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.695906400680542, + "num_tokens": 128186958.0, + "step": 5055 + }, + { + "epoch": 0.5552383044146716, + "grad_norm": 2.1480233669281006, + "learning_rate": 1e-06, + "loss": 1.008, + "mean_token_accuracy": 0.6975072026252747, + "num_tokens": 128213477.0, + "step": 5056 + }, + { + "epoch": 0.5553481221172853, + "grad_norm": 1.9932173490524292, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.696231484413147, + "num_tokens": 
128245205.0, + "step": 5057 + }, + { + "epoch": 0.555457939819899, + "grad_norm": 2.4548962116241455, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7131818532943726, + "num_tokens": 128266098.0, + "step": 5058 + }, + { + "epoch": 0.5555677575225126, + "grad_norm": 2.3872501850128174, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.6902115941047668, + "num_tokens": 128288714.0, + "step": 5059 + }, + { + "epoch": 0.5556775752251263, + "grad_norm": 2.1098761558532715, + "learning_rate": 1e-06, + "loss": 1.0473, + "mean_token_accuracy": 0.687849760055542, + "num_tokens": 128316647.0, + "step": 5060 + }, + { + "epoch": 0.5557873929277399, + "grad_norm": 2.368133068084717, + "learning_rate": 1e-06, + "loss": 1.0483, + "mean_token_accuracy": 0.6946299076080322, + "num_tokens": 128339362.0, + "step": 5061 + }, + { + "epoch": 0.5558972106303536, + "grad_norm": 2.2240030765533447, + "learning_rate": 1e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.6896201968193054, + "num_tokens": 128366457.0, + "step": 5062 + }, + { + "epoch": 0.5560070283329672, + "grad_norm": 2.272019624710083, + "learning_rate": 1e-06, + "loss": 1.0469, + "mean_token_accuracy": 0.6812823414802551, + "num_tokens": 128393143.0, + "step": 5063 + }, + { + "epoch": 0.556116846035581, + "grad_norm": 2.3313682079315186, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7041812539100647, + "num_tokens": 128417584.0, + "step": 5064 + }, + { + "epoch": 0.5562266637381946, + "grad_norm": 1.978717565536499, + "learning_rate": 1e-06, + "loss": 1.0564, + "mean_token_accuracy": 0.6870384216308594, + "num_tokens": 128449978.0, + "step": 5065 + }, + { + "epoch": 0.5563364814408083, + "grad_norm": 2.3910470008850098, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6961731314659119, + "num_tokens": 128473331.0, + "step": 5066 + }, + { + "epoch": 0.5564462991434219, + "grad_norm": 2.312244176864624, + "learning_rate": 1e-06, + 
"loss": 0.9437, + "mean_token_accuracy": 0.7194459438323975, + "num_tokens": 128497640.0, + "step": 5067 + }, + { + "epoch": 0.5565561168460356, + "grad_norm": 2.1362035274505615, + "learning_rate": 1e-06, + "loss": 1.088, + "mean_token_accuracy": 0.6796430349349976, + "num_tokens": 128528902.0, + "step": 5068 + }, + { + "epoch": 0.5566659345486492, + "grad_norm": 2.232210397720337, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.6941745281219482, + "num_tokens": 128552026.0, + "step": 5069 + }, + { + "epoch": 0.5567757522512629, + "grad_norm": 2.167767286300659, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.6976001262664795, + "num_tokens": 128578097.0, + "step": 5070 + }, + { + "epoch": 0.5568855699538766, + "grad_norm": 2.514429807662964, + "learning_rate": 1e-06, + "loss": 0.8421, + "mean_token_accuracy": 0.7351479530334473, + "num_tokens": 128596635.0, + "step": 5071 + }, + { + "epoch": 0.5569953876564903, + "grad_norm": 2.170517921447754, + "learning_rate": 1e-06, + "loss": 1.0459, + "mean_token_accuracy": 0.6833623051643372, + "num_tokens": 128621158.0, + "step": 5072 + }, + { + "epoch": 0.5571052053591039, + "grad_norm": 2.1183764934539795, + "learning_rate": 1e-06, + "loss": 1.0858, + "mean_token_accuracy": 0.6762268543243408, + "num_tokens": 128647937.0, + "step": 5073 + }, + { + "epoch": 0.5572150230617176, + "grad_norm": 2.0617947578430176, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.6976280808448792, + "num_tokens": 128675942.0, + "step": 5074 + }, + { + "epoch": 0.5573248407643312, + "grad_norm": 2.2609734535217285, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.697619616985321, + "num_tokens": 128699936.0, + "step": 5075 + }, + { + "epoch": 0.5574346584669448, + "grad_norm": 2.4152863025665283, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7152155041694641, + "num_tokens": 128721755.0, + "step": 5076 + }, + { + "epoch": 
0.5575444761695585, + "grad_norm": 2.2122654914855957, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.6875301599502563, + "num_tokens": 128747183.0, + "step": 5077 + }, + { + "epoch": 0.5576542938721722, + "grad_norm": 2.153989791870117, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7178294062614441, + "num_tokens": 128773619.0, + "step": 5078 + }, + { + "epoch": 0.5577641115747859, + "grad_norm": 2.2063450813293457, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.6988350749015808, + "num_tokens": 128798564.0, + "step": 5079 + }, + { + "epoch": 0.5578739292773995, + "grad_norm": 2.2386677265167236, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.7031719088554382, + "num_tokens": 128823335.0, + "step": 5080 + }, + { + "epoch": 0.5579837469800132, + "grad_norm": 2.2569148540496826, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.7157090902328491, + "num_tokens": 128847762.0, + "step": 5081 + }, + { + "epoch": 0.5580935646826268, + "grad_norm": 2.175534963607788, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7184404134750366, + "num_tokens": 128871411.0, + "step": 5082 + }, + { + "epoch": 0.5582033823852405, + "grad_norm": 2.094132423400879, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.7010997533798218, + "num_tokens": 128897317.0, + "step": 5083 + }, + { + "epoch": 0.5583132000878541, + "grad_norm": 2.1736133098602295, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.7014092206954956, + "num_tokens": 128922945.0, + "step": 5084 + }, + { + "epoch": 0.5584230177904678, + "grad_norm": 2.1344497203826904, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7040954828262329, + "num_tokens": 128949456.0, + "step": 5085 + }, + { + "epoch": 0.5585328354930815, + "grad_norm": 2.3766531944274902, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 
0.6910769939422607, + "num_tokens": 128971657.0, + "step": 5086 + }, + { + "epoch": 0.5586426531956952, + "grad_norm": 2.029775381088257, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.705744743347168, + "num_tokens": 129000442.0, + "step": 5087 + }, + { + "epoch": 0.5587524708983088, + "grad_norm": 2.4208624362945557, + "learning_rate": 1e-06, + "loss": 1.0427, + "mean_token_accuracy": 0.6886026263237, + "num_tokens": 129023214.0, + "step": 5088 + }, + { + "epoch": 0.5588622886009225, + "grad_norm": 2.4755637645721436, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.6932145953178406, + "num_tokens": 129043907.0, + "step": 5089 + }, + { + "epoch": 0.5589721063035361, + "grad_norm": 2.16920804977417, + "learning_rate": 1e-06, + "loss": 1.0327, + "mean_token_accuracy": 0.6870467662811279, + "num_tokens": 129068860.0, + "step": 5090 + }, + { + "epoch": 0.5590819240061498, + "grad_norm": 2.153984546661377, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7135870456695557, + "num_tokens": 129092944.0, + "step": 5091 + }, + { + "epoch": 0.5591917417087634, + "grad_norm": 2.2908544540405273, + "learning_rate": 1e-06, + "loss": 1.0659, + "mean_token_accuracy": 0.6873559951782227, + "num_tokens": 129120571.0, + "step": 5092 + }, + { + "epoch": 0.5593015594113772, + "grad_norm": 2.0101094245910645, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.709627628326416, + "num_tokens": 129148273.0, + "step": 5093 + }, + { + "epoch": 0.5594113771139908, + "grad_norm": 2.067850351333618, + "learning_rate": 1e-06, + "loss": 1.0237, + "mean_token_accuracy": 0.6930952072143555, + "num_tokens": 129178001.0, + "step": 5094 + }, + { + "epoch": 0.5595211948166045, + "grad_norm": 2.184638738632202, + "learning_rate": 1e-06, + "loss": 1.05, + "mean_token_accuracy": 0.6895800828933716, + "num_tokens": 129204965.0, + "step": 5095 + }, + { + "epoch": 0.5596310125192181, + "grad_norm": 2.0563135147094727, + 
"learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.7003419399261475, + "num_tokens": 129234729.0, + "step": 5096 + }, + { + "epoch": 0.5597408302218317, + "grad_norm": 1.9795502424240112, + "learning_rate": 1e-06, + "loss": 1.0487, + "mean_token_accuracy": 0.6809425950050354, + "num_tokens": 129265021.0, + "step": 5097 + }, + { + "epoch": 0.5598506479244454, + "grad_norm": 2.3070361614227295, + "learning_rate": 1e-06, + "loss": 1.0276, + "mean_token_accuracy": 0.6908971071243286, + "num_tokens": 129289733.0, + "step": 5098 + }, + { + "epoch": 0.559960465627059, + "grad_norm": 2.180349349975586, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7047002911567688, + "num_tokens": 129319784.0, + "step": 5099 + }, + { + "epoch": 0.5600702833296728, + "grad_norm": 2.200530767440796, + "learning_rate": 1e-06, + "loss": 1.0411, + "mean_token_accuracy": 0.6984879970550537, + "num_tokens": 129347280.0, + "step": 5100 + }, + { + "epoch": 0.5601801010322864, + "grad_norm": 2.382502317428589, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7050076127052307, + "num_tokens": 129369249.0, + "step": 5101 + }, + { + "epoch": 0.5602899187349001, + "grad_norm": 2.2719225883483887, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.6922598481178284, + "num_tokens": 129392998.0, + "step": 5102 + }, + { + "epoch": 0.5603997364375137, + "grad_norm": 2.165649175643921, + "learning_rate": 1e-06, + "loss": 1.0251, + "mean_token_accuracy": 0.6968684196472168, + "num_tokens": 129420232.0, + "step": 5103 + }, + { + "epoch": 0.5605095541401274, + "grad_norm": 2.2730765342712402, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.6979249715805054, + "num_tokens": 129443211.0, + "step": 5104 + }, + { + "epoch": 0.560619371842741, + "grad_norm": 2.0675365924835205, + "learning_rate": 1e-06, + "loss": 1.0619, + "mean_token_accuracy": 0.685002326965332, + "num_tokens": 129473832.0, + "step": 5105 + }, 
+ { + "epoch": 0.5607291895453547, + "grad_norm": 2.178786277770996, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.6954851150512695, + "num_tokens": 129498964.0, + "step": 5106 + }, + { + "epoch": 0.5608390072479684, + "grad_norm": 2.216712236404419, + "learning_rate": 1e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.7317683696746826, + "num_tokens": 129525849.0, + "step": 5107 + }, + { + "epoch": 0.5609488249505821, + "grad_norm": 2.4170498847961426, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7040159106254578, + "num_tokens": 129547867.0, + "step": 5108 + }, + { + "epoch": 0.5610586426531957, + "grad_norm": 2.2350733280181885, + "learning_rate": 1e-06, + "loss": 1.054, + "mean_token_accuracy": 0.6775007247924805, + "num_tokens": 129573527.0, + "step": 5109 + }, + { + "epoch": 0.5611684603558094, + "grad_norm": 2.4710183143615723, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7057914733886719, + "num_tokens": 129594234.0, + "step": 5110 + }, + { + "epoch": 0.561278278058423, + "grad_norm": 1.9759156703948975, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.6938644051551819, + "num_tokens": 129622800.0, + "step": 5111 + }, + { + "epoch": 0.5613880957610367, + "grad_norm": 2.1475815773010254, + "learning_rate": 1e-06, + "loss": 1.073, + "mean_token_accuracy": 0.6952177882194519, + "num_tokens": 129648754.0, + "step": 5112 + }, + { + "epoch": 0.5614979134636503, + "grad_norm": 2.380744218826294, + "learning_rate": 1e-06, + "loss": 1.083, + "mean_token_accuracy": 0.679423987865448, + "num_tokens": 129672442.0, + "step": 5113 + }, + { + "epoch": 0.5616077311662641, + "grad_norm": 2.3484184741973877, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7062162160873413, + "num_tokens": 129694040.0, + "step": 5114 + }, + { + "epoch": 0.5617175488688777, + "grad_norm": 2.1362314224243164, + "learning_rate": 1e-06, + "loss": 0.9877, + 
"mean_token_accuracy": 0.6948291063308716, + "num_tokens": 129721135.0, + "step": 5115 + }, + { + "epoch": 0.5618273665714913, + "grad_norm": 2.108774423599243, + "learning_rate": 1e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.7310051918029785, + "num_tokens": 129747768.0, + "step": 5116 + }, + { + "epoch": 0.561937184274105, + "grad_norm": 2.247790575027466, + "learning_rate": 1e-06, + "loss": 0.881, + "mean_token_accuracy": 0.7306782007217407, + "num_tokens": 129771053.0, + "step": 5117 + }, + { + "epoch": 0.5620470019767186, + "grad_norm": 1.9699461460113525, + "learning_rate": 1e-06, + "loss": 1.0866, + "mean_token_accuracy": 0.6740666031837463, + "num_tokens": 129804907.0, + "step": 5118 + }, + { + "epoch": 0.5621568196793323, + "grad_norm": 2.5287606716156006, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7146366834640503, + "num_tokens": 129826283.0, + "step": 5119 + }, + { + "epoch": 0.5622666373819459, + "grad_norm": 2.1039059162139893, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.7032188177108765, + "num_tokens": 129855473.0, + "step": 5120 + }, + { + "epoch": 0.5623764550845596, + "grad_norm": 2.0192503929138184, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.6920291185379028, + "num_tokens": 129885642.0, + "step": 5121 + }, + { + "epoch": 0.5624862727871733, + "grad_norm": 2.1168553829193115, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.720651388168335, + "num_tokens": 129910349.0, + "step": 5122 + }, + { + "epoch": 0.562596090489787, + "grad_norm": 1.9870944023132324, + "learning_rate": 1e-06, + "loss": 1.0863, + "mean_token_accuracy": 0.6735632419586182, + "num_tokens": 129943887.0, + "step": 5123 + }, + { + "epoch": 0.5627059081924006, + "grad_norm": 2.2240185737609863, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7202051877975464, + "num_tokens": 129968981.0, + "step": 5124 + }, + { + "epoch": 0.5628157258950143, + 
"grad_norm": 2.413320302963257, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.6933554410934448, + "num_tokens": 129993666.0, + "step": 5125 + }, + { + "epoch": 0.5629255435976279, + "grad_norm": 2.1111085414886475, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7007997632026672, + "num_tokens": 130021444.0, + "step": 5126 + }, + { + "epoch": 0.5630353613002416, + "grad_norm": 2.310868501663208, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7126063704490662, + "num_tokens": 130043874.0, + "step": 5127 + }, + { + "epoch": 0.5631451790028552, + "grad_norm": 2.4817018508911133, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7177262902259827, + "num_tokens": 130064162.0, + "step": 5128 + }, + { + "epoch": 0.563254996705469, + "grad_norm": 2.213012456893921, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7037241458892822, + "num_tokens": 130091359.0, + "step": 5129 + }, + { + "epoch": 0.5633648144080826, + "grad_norm": 2.0688483715057373, + "learning_rate": 1e-06, + "loss": 1.1117, + "mean_token_accuracy": 0.6728286147117615, + "num_tokens": 130122605.0, + "step": 5130 + }, + { + "epoch": 0.5634746321106963, + "grad_norm": 1.8722432851791382, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7219091653823853, + "num_tokens": 130155909.0, + "step": 5131 + }, + { + "epoch": 0.5635844498133099, + "grad_norm": 2.2037103176116943, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.691779375076294, + "num_tokens": 130183034.0, + "step": 5132 + }, + { + "epoch": 0.5636942675159236, + "grad_norm": 2.315556287765503, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7057927846908569, + "num_tokens": 130205609.0, + "step": 5133 + }, + { + "epoch": 0.5638040852185372, + "grad_norm": 2.1318087577819824, + "learning_rate": 1e-06, + "loss": 0.872, + "mean_token_accuracy": 0.7342054843902588, + 
"num_tokens": 130232611.0, + "step": 5134 + }, + { + "epoch": 0.5639139029211508, + "grad_norm": 2.077284574508667, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7050908803939819, + "num_tokens": 130260902.0, + "step": 5135 + }, + { + "epoch": 0.5640237206237646, + "grad_norm": 2.27156662940979, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7001532316207886, + "num_tokens": 130284610.0, + "step": 5136 + }, + { + "epoch": 0.5641335383263782, + "grad_norm": 2.0688066482543945, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7071673274040222, + "num_tokens": 130312301.0, + "step": 5137 + }, + { + "epoch": 0.5642433560289919, + "grad_norm": 2.403895378112793, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7044002413749695, + "num_tokens": 130336163.0, + "step": 5138 + }, + { + "epoch": 0.5643531737316055, + "grad_norm": 2.072566509246826, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7156516909599304, + "num_tokens": 130362461.0, + "step": 5139 + }, + { + "epoch": 0.5644629914342192, + "grad_norm": 2.1845269203186035, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.6899971961975098, + "num_tokens": 130390338.0, + "step": 5140 + }, + { + "epoch": 0.5645728091368328, + "grad_norm": 2.1950771808624268, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7271180152893066, + "num_tokens": 130417275.0, + "step": 5141 + }, + { + "epoch": 0.5646826268394465, + "grad_norm": 2.2872047424316406, + "learning_rate": 1e-06, + "loss": 1.0441, + "mean_token_accuracy": 0.6839879155158997, + "num_tokens": 130442571.0, + "step": 5142 + }, + { + "epoch": 0.5647924445420602, + "grad_norm": 2.0542123317718506, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.7085211277008057, + "num_tokens": 130472791.0, + "step": 5143 + }, + { + "epoch": 0.5649022622446739, + "grad_norm": 2.340810537338257, + 
"learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.720643937587738, + "num_tokens": 130493622.0, + "step": 5144 + }, + { + "epoch": 0.5650120799472875, + "grad_norm": 2.2516229152679443, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7081283330917358, + "num_tokens": 130517358.0, + "step": 5145 + }, + { + "epoch": 0.5651218976499012, + "grad_norm": 2.1204042434692383, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7031069397926331, + "num_tokens": 130545125.0, + "step": 5146 + }, + { + "epoch": 0.5652317153525148, + "grad_norm": 2.2402424812316895, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.6827185153961182, + "num_tokens": 130569971.0, + "step": 5147 + }, + { + "epoch": 0.5653415330551285, + "grad_norm": 2.1147961616516113, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.706229567527771, + "num_tokens": 130596320.0, + "step": 5148 + }, + { + "epoch": 0.5654513507577421, + "grad_norm": 2.0618839263916016, + "learning_rate": 1e-06, + "loss": 0.8806, + "mean_token_accuracy": 0.7225902676582336, + "num_tokens": 130624109.0, + "step": 5149 + }, + { + "epoch": 0.5655611684603558, + "grad_norm": 2.269991636276245, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7077440023422241, + "num_tokens": 130647376.0, + "step": 5150 + }, + { + "epoch": 0.5656709861629695, + "grad_norm": 2.335193157196045, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7084075212478638, + "num_tokens": 130671224.0, + "step": 5151 + }, + { + "epoch": 0.5657808038655832, + "grad_norm": 2.192420244216919, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7142132520675659, + "num_tokens": 130695530.0, + "step": 5152 + }, + { + "epoch": 0.5658906215681968, + "grad_norm": 2.201977491378784, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.697723925113678, + "num_tokens": 130722661.0, + "step": 5153 + }, 
+ { + "epoch": 0.5660004392708105, + "grad_norm": 2.011564016342163, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7130633592605591, + "num_tokens": 130751667.0, + "step": 5154 + }, + { + "epoch": 0.5661102569734241, + "grad_norm": 2.1383211612701416, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7112878561019897, + "num_tokens": 130780157.0, + "step": 5155 + }, + { + "epoch": 0.5662200746760377, + "grad_norm": 2.2148184776306152, + "learning_rate": 1e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.6861103177070618, + "num_tokens": 130807037.0, + "step": 5156 + }, + { + "epoch": 0.5663298923786514, + "grad_norm": 2.2818164825439453, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7185405492782593, + "num_tokens": 130833002.0, + "step": 5157 + }, + { + "epoch": 0.5664397100812651, + "grad_norm": 2.131432056427002, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.709587812423706, + "num_tokens": 130859852.0, + "step": 5158 + }, + { + "epoch": 0.5665495277838788, + "grad_norm": 2.674468994140625, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.7006855607032776, + "num_tokens": 130877845.0, + "step": 5159 + }, + { + "epoch": 0.5666593454864924, + "grad_norm": 2.2676100730895996, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7280642986297607, + "num_tokens": 130902126.0, + "step": 5160 + }, + { + "epoch": 0.5667691631891061, + "grad_norm": 2.778238296508789, + "learning_rate": 1e-06, + "loss": 0.8388, + "mean_token_accuracy": 0.7349017262458801, + "num_tokens": 130917848.0, + "step": 5161 + }, + { + "epoch": 0.5668789808917197, + "grad_norm": 2.2749269008636475, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7188624739646912, + "num_tokens": 130940488.0, + "step": 5162 + }, + { + "epoch": 0.5669887985943334, + "grad_norm": 1.949161171913147, + "learning_rate": 1e-06, + "loss": 0.9847, + 
"mean_token_accuracy": 0.6994398832321167, + "num_tokens": 130971572.0, + "step": 5163 + }, + { + "epoch": 0.567098616296947, + "grad_norm": 1.9777568578720093, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7145754098892212, + "num_tokens": 131001916.0, + "step": 5164 + }, + { + "epoch": 0.5672084339995608, + "grad_norm": 2.3818318843841553, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.6992394924163818, + "num_tokens": 131023710.0, + "step": 5165 + }, + { + "epoch": 0.5673182517021744, + "grad_norm": 2.0642032623291016, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.698756992816925, + "num_tokens": 131054779.0, + "step": 5166 + }, + { + "epoch": 0.5674280694047881, + "grad_norm": 2.1056535243988037, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.6981917023658752, + "num_tokens": 131083731.0, + "step": 5167 + }, + { + "epoch": 0.5675378871074017, + "grad_norm": 2.543170213699341, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7292333841323853, + "num_tokens": 131102866.0, + "step": 5168 + }, + { + "epoch": 0.5676477048100154, + "grad_norm": 1.9891468286514282, + "learning_rate": 1e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.6856861114501953, + "num_tokens": 131132781.0, + "step": 5169 + }, + { + "epoch": 0.567757522512629, + "grad_norm": 2.58532452583313, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7108888626098633, + "num_tokens": 131152572.0, + "step": 5170 + }, + { + "epoch": 0.5678673402152427, + "grad_norm": 2.189074754714966, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7200558185577393, + "num_tokens": 131178223.0, + "step": 5171 + }, + { + "epoch": 0.5679771579178564, + "grad_norm": 2.522566318511963, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.6975107192993164, + "num_tokens": 131199241.0, + "step": 5172 + }, + { + "epoch": 0.5680869756204701, + 
"grad_norm": 2.1027584075927734, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7036442756652832, + "num_tokens": 131227010.0, + "step": 5173 + }, + { + "epoch": 0.5681967933230837, + "grad_norm": 2.074903726577759, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7003294825553894, + "num_tokens": 131256380.0, + "step": 5174 + }, + { + "epoch": 0.5683066110256974, + "grad_norm": 2.562260389328003, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7230198383331299, + "num_tokens": 131277383.0, + "step": 5175 + }, + { + "epoch": 0.568416428728311, + "grad_norm": 2.108572483062744, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.69608074426651, + "num_tokens": 131306145.0, + "step": 5176 + }, + { + "epoch": 0.5685262464309246, + "grad_norm": 2.5819101333618164, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7094406485557556, + "num_tokens": 131327867.0, + "step": 5177 + }, + { + "epoch": 0.5686360641335383, + "grad_norm": 1.8661648035049438, + "learning_rate": 1e-06, + "loss": 1.0381, + "mean_token_accuracy": 0.687741219997406, + "num_tokens": 131362595.0, + "step": 5178 + }, + { + "epoch": 0.5687458818361519, + "grad_norm": 2.247943878173828, + "learning_rate": 1e-06, + "loss": 1.0153, + "mean_token_accuracy": 0.6958209872245789, + "num_tokens": 131387688.0, + "step": 5179 + }, + { + "epoch": 0.5688556995387657, + "grad_norm": 2.0630648136138916, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.6925811767578125, + "num_tokens": 131416225.0, + "step": 5180 + }, + { + "epoch": 0.5689655172413793, + "grad_norm": 2.1320221424102783, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.6916155815124512, + "num_tokens": 131443768.0, + "step": 5181 + }, + { + "epoch": 0.569075334943993, + "grad_norm": 2.1510298252105713, + "learning_rate": 1e-06, + "loss": 1.0488, + "mean_token_accuracy": 0.686629593372345, + "num_tokens": 
131472451.0, + "step": 5182 + }, + { + "epoch": 0.5691851526466066, + "grad_norm": 1.8823585510253906, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7026865482330322, + "num_tokens": 131505069.0, + "step": 5183 + }, + { + "epoch": 0.5692949703492203, + "grad_norm": 2.3727471828460693, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.6999741196632385, + "num_tokens": 131529641.0, + "step": 5184 + }, + { + "epoch": 0.5694047880518339, + "grad_norm": 2.000673294067383, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.6994300484657288, + "num_tokens": 131559740.0, + "step": 5185 + }, + { + "epoch": 0.5695146057544476, + "grad_norm": 2.205796957015991, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.730655312538147, + "num_tokens": 131583085.0, + "step": 5186 + }, + { + "epoch": 0.5696244234570613, + "grad_norm": 2.3194401264190674, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.6953302621841431, + "num_tokens": 131607495.0, + "step": 5187 + }, + { + "epoch": 0.569734241159675, + "grad_norm": 2.154165267944336, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7302060127258301, + "num_tokens": 131633636.0, + "step": 5188 + }, + { + "epoch": 0.5698440588622886, + "grad_norm": 2.5171937942504883, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7211076021194458, + "num_tokens": 131653876.0, + "step": 5189 + }, + { + "epoch": 0.5699538765649023, + "grad_norm": 2.2941973209381104, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7080575823783875, + "num_tokens": 131676797.0, + "step": 5190 + }, + { + "epoch": 0.5700636942675159, + "grad_norm": 2.0659615993499756, + "learning_rate": 1e-06, + "loss": 1.0289, + "mean_token_accuracy": 0.6890944242477417, + "num_tokens": 131705184.0, + "step": 5191 + }, + { + "epoch": 0.5701735119701296, + "grad_norm": 2.0215542316436768, + "learning_rate": 1e-06, + 
"loss": 1.003, + "mean_token_accuracy": 0.6956958770751953, + "num_tokens": 131733151.0, + "step": 5192 + }, + { + "epoch": 0.5702833296727432, + "grad_norm": 2.2946078777313232, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.699272096157074, + "num_tokens": 131756653.0, + "step": 5193 + }, + { + "epoch": 0.570393147375357, + "grad_norm": 2.448500156402588, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7274880409240723, + "num_tokens": 131776070.0, + "step": 5194 + }, + { + "epoch": 0.5705029650779706, + "grad_norm": 1.8822592496871948, + "learning_rate": 1e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.6825692653656006, + "num_tokens": 131812114.0, + "step": 5195 + }, + { + "epoch": 0.5706127827805842, + "grad_norm": 2.0712311267852783, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7073302268981934, + "num_tokens": 131838789.0, + "step": 5196 + }, + { + "epoch": 0.5707226004831979, + "grad_norm": 2.1616158485412598, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.6967736482620239, + "num_tokens": 131864241.0, + "step": 5197 + }, + { + "epoch": 0.5708324181858115, + "grad_norm": 2.3174617290496826, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.7014620304107666, + "num_tokens": 131885983.0, + "step": 5198 + }, + { + "epoch": 0.5709422358884252, + "grad_norm": 2.2239205837249756, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.6993926763534546, + "num_tokens": 131911001.0, + "step": 5199 + }, + { + "epoch": 0.5710520535910388, + "grad_norm": 2.1909873485565186, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7214455604553223, + "num_tokens": 131935881.0, + "step": 5200 + }, + { + "epoch": 0.5711618712936526, + "grad_norm": 2.227660655975342, + "learning_rate": 1e-06, + "loss": 1.0076, + "mean_token_accuracy": 0.6999640464782715, + "num_tokens": 131962832.0, + "step": 5201 + }, + { + "epoch": 
0.5712716889962662, + "grad_norm": 1.9756337404251099, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.6994683742523193, + "num_tokens": 131995191.0, + "step": 5202 + }, + { + "epoch": 0.5713815066988799, + "grad_norm": 2.1255550384521484, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7091919183731079, + "num_tokens": 132020241.0, + "step": 5203 + }, + { + "epoch": 0.5714913244014935, + "grad_norm": 2.129427194595337, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7016105651855469, + "num_tokens": 132046769.0, + "step": 5204 + }, + { + "epoch": 0.5716011421041072, + "grad_norm": 2.5660619735717773, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7114615440368652, + "num_tokens": 132066303.0, + "step": 5205 + }, + { + "epoch": 0.5717109598067208, + "grad_norm": 2.2173073291778564, + "learning_rate": 1e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.6922067999839783, + "num_tokens": 132092831.0, + "step": 5206 + }, + { + "epoch": 0.5718207775093345, + "grad_norm": 2.0572125911712646, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7313258051872253, + "num_tokens": 132119567.0, + "step": 5207 + }, + { + "epoch": 0.5719305952119481, + "grad_norm": 2.354036569595337, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.6981204152107239, + "num_tokens": 132142553.0, + "step": 5208 + }, + { + "epoch": 0.5720404129145619, + "grad_norm": 2.2076077461242676, + "learning_rate": 1e-06, + "loss": 1.0387, + "mean_token_accuracy": 0.685465931892395, + "num_tokens": 132168768.0, + "step": 5209 + }, + { + "epoch": 0.5721502306171755, + "grad_norm": 2.154621124267578, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.6963744163513184, + "num_tokens": 132195039.0, + "step": 5210 + }, + { + "epoch": 0.5722600483197892, + "grad_norm": 2.584642171859741, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 
0.7038865685462952, + "num_tokens": 132214966.0, + "step": 5211 + }, + { + "epoch": 0.5723698660224028, + "grad_norm": 2.40175724029541, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7108505964279175, + "num_tokens": 132236265.0, + "step": 5212 + }, + { + "epoch": 0.5724796837250165, + "grad_norm": 2.0908780097961426, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7014362812042236, + "num_tokens": 132265364.0, + "step": 5213 + }, + { + "epoch": 0.5725895014276301, + "grad_norm": 2.066406726837158, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.714402437210083, + "num_tokens": 132291980.0, + "step": 5214 + }, + { + "epoch": 0.5726993191302437, + "grad_norm": 2.29683518409729, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.6934685707092285, + "num_tokens": 132316587.0, + "step": 5215 + }, + { + "epoch": 0.5728091368328575, + "grad_norm": 2.079700469970703, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.6974152326583862, + "num_tokens": 132343883.0, + "step": 5216 + }, + { + "epoch": 0.5729189545354711, + "grad_norm": 2.2193562984466553, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6875762939453125, + "num_tokens": 132370359.0, + "step": 5217 + }, + { + "epoch": 0.5730287722380848, + "grad_norm": 2.080465793609619, + "learning_rate": 1e-06, + "loss": 1.0509, + "mean_token_accuracy": 0.6894962787628174, + "num_tokens": 132400838.0, + "step": 5218 + }, + { + "epoch": 0.5731385899406984, + "grad_norm": 2.3413989543914795, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.7054136991500854, + "num_tokens": 132424110.0, + "step": 5219 + }, + { + "epoch": 0.5732484076433121, + "grad_norm": 2.449083089828491, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.721823513507843, + "num_tokens": 132444579.0, + "step": 5220 + }, + { + "epoch": 0.5733582253459257, + "grad_norm": 2.210512399673462, 
+ "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.6993774175643921, + "num_tokens": 132470259.0, + "step": 5221 + }, + { + "epoch": 0.5734680430485394, + "grad_norm": 2.198549747467041, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6956659555435181, + "num_tokens": 132495273.0, + "step": 5222 + }, + { + "epoch": 0.5735778607511531, + "grad_norm": 2.664716958999634, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7126469612121582, + "num_tokens": 132514183.0, + "step": 5223 + }, + { + "epoch": 0.5736876784537668, + "grad_norm": 2.2340235710144043, + "learning_rate": 1e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6911954283714294, + "num_tokens": 132541486.0, + "step": 5224 + }, + { + "epoch": 0.5737974961563804, + "grad_norm": 2.010831594467163, + "learning_rate": 1e-06, + "loss": 1.079, + "mean_token_accuracy": 0.6759300231933594, + "num_tokens": 132572121.0, + "step": 5225 + }, + { + "epoch": 0.5739073138589941, + "grad_norm": 2.1498916149139404, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7052387595176697, + "num_tokens": 132600268.0, + "step": 5226 + }, + { + "epoch": 0.5740171315616077, + "grad_norm": 2.378607988357544, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7156311273574829, + "num_tokens": 132622287.0, + "step": 5227 + }, + { + "epoch": 0.5741269492642214, + "grad_norm": 2.111144781112671, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.6917753219604492, + "num_tokens": 132651156.0, + "step": 5228 + }, + { + "epoch": 0.574236766966835, + "grad_norm": 2.4752094745635986, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7054749727249146, + "num_tokens": 132671813.0, + "step": 5229 + }, + { + "epoch": 0.5743465846694488, + "grad_norm": 1.9972420930862427, + "learning_rate": 1e-06, + "loss": 1.0504, + "mean_token_accuracy": 0.6881874799728394, + "num_tokens": 132704349.0, + "step": 5230 + 
}, + { + "epoch": 0.5744564023720624, + "grad_norm": 2.268953323364258, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.7076601982116699, + "num_tokens": 132729685.0, + "step": 5231 + }, + { + "epoch": 0.5745662200746761, + "grad_norm": 2.387075901031494, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7157081365585327, + "num_tokens": 132750555.0, + "step": 5232 + }, + { + "epoch": 0.5746760377772897, + "grad_norm": 2.3038339614868164, + "learning_rate": 1e-06, + "loss": 1.0586, + "mean_token_accuracy": 0.68282151222229, + "num_tokens": 132774570.0, + "step": 5233 + }, + { + "epoch": 0.5747858554799034, + "grad_norm": 2.0103211402893066, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7099185585975647, + "num_tokens": 132805117.0, + "step": 5234 + }, + { + "epoch": 0.574895673182517, + "grad_norm": 2.281106948852539, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7118721008300781, + "num_tokens": 132829082.0, + "step": 5235 + }, + { + "epoch": 0.5750054908851306, + "grad_norm": 2.0314154624938965, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7158993482589722, + "num_tokens": 132857333.0, + "step": 5236 + }, + { + "epoch": 0.5751153085877443, + "grad_norm": 2.521620273590088, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7128307819366455, + "num_tokens": 132876767.0, + "step": 5237 + }, + { + "epoch": 0.575225126290358, + "grad_norm": 1.9884833097457886, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7072185277938843, + "num_tokens": 132906608.0, + "step": 5238 + }, + { + "epoch": 0.5753349439929717, + "grad_norm": 2.264435291290283, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.7019309997558594, + "num_tokens": 132931484.0, + "step": 5239 + }, + { + "epoch": 0.5754447616955853, + "grad_norm": 2.011446237564087, + "learning_rate": 1e-06, + "loss": 0.9768, + 
"mean_token_accuracy": 0.6986514925956726, + "num_tokens": 132962810.0, + "step": 5240 + }, + { + "epoch": 0.575554579398199, + "grad_norm": 2.433492660522461, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7068790197372437, + "num_tokens": 132984231.0, + "step": 5241 + }, + { + "epoch": 0.5756643971008126, + "grad_norm": 1.8703267574310303, + "learning_rate": 1e-06, + "loss": 1.0449, + "mean_token_accuracy": 0.6857808828353882, + "num_tokens": 133018409.0, + "step": 5242 + }, + { + "epoch": 0.5757742148034263, + "grad_norm": 2.3908486366271973, + "learning_rate": 1e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.6994686722755432, + "num_tokens": 133043100.0, + "step": 5243 + }, + { + "epoch": 0.5758840325060399, + "grad_norm": 2.4763741493225098, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7248556613922119, + "num_tokens": 133062689.0, + "step": 5244 + }, + { + "epoch": 0.5759938502086537, + "grad_norm": 2.353928804397583, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7119991779327393, + "num_tokens": 133085114.0, + "step": 5245 + }, + { + "epoch": 0.5761036679112673, + "grad_norm": 2.2241690158843994, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7249348163604736, + "num_tokens": 133108558.0, + "step": 5246 + }, + { + "epoch": 0.576213485613881, + "grad_norm": 2.3427603244781494, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7103067636489868, + "num_tokens": 133131371.0, + "step": 5247 + }, + { + "epoch": 0.5763233033164946, + "grad_norm": 1.9843292236328125, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.700782060623169, + "num_tokens": 133160745.0, + "step": 5248 + }, + { + "epoch": 0.5764331210191083, + "grad_norm": 2.0024871826171875, + "learning_rate": 1e-06, + "loss": 1.0289, + "mean_token_accuracy": 0.6879557371139526, + "num_tokens": 133193141.0, + "step": 5249 + }, + { + "epoch": 0.5765429387217219, + 
"grad_norm": 2.368863821029663, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7043725848197937, + "num_tokens": 133217013.0, + "step": 5250 + }, + { + "epoch": 0.5766527564243356, + "grad_norm": 2.2485783100128174, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7070867419242859, + "num_tokens": 133240781.0, + "step": 5251 + }, + { + "epoch": 0.5767625741269493, + "grad_norm": 2.232553005218506, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.6987704038619995, + "num_tokens": 133264345.0, + "step": 5252 + }, + { + "epoch": 0.576872391829563, + "grad_norm": 2.117671251296997, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6963802576065063, + "num_tokens": 133292193.0, + "step": 5253 + }, + { + "epoch": 0.5769822095321766, + "grad_norm": 2.4109745025634766, + "learning_rate": 1e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.7045350074768066, + "num_tokens": 133316545.0, + "step": 5254 + }, + { + "epoch": 0.5770920272347903, + "grad_norm": 2.337094306945801, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.6897042989730835, + "num_tokens": 133340104.0, + "step": 5255 + }, + { + "epoch": 0.5772018449374039, + "grad_norm": 2.2110846042633057, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7041777968406677, + "num_tokens": 133365097.0, + "step": 5256 + }, + { + "epoch": 0.5773116626400175, + "grad_norm": 2.521847724914551, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7187705636024475, + "num_tokens": 133386529.0, + "step": 5257 + }, + { + "epoch": 0.5774214803426312, + "grad_norm": 2.253798723220825, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7028258442878723, + "num_tokens": 133412857.0, + "step": 5258 + }, + { + "epoch": 0.577531298045245, + "grad_norm": 2.261258125305176, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.719629168510437, + "num_tokens": 
133437679.0, + "step": 5259 + }, + { + "epoch": 0.5776411157478586, + "grad_norm": 1.9988096952438354, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7180655002593994, + "num_tokens": 133466401.0, + "step": 5260 + }, + { + "epoch": 0.5777509334504722, + "grad_norm": 2.001199245452881, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.6978698968887329, + "num_tokens": 133496069.0, + "step": 5261 + }, + { + "epoch": 0.5778607511530859, + "grad_norm": 2.782827615737915, + "learning_rate": 1e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.733099639415741, + "num_tokens": 133512914.0, + "step": 5262 + }, + { + "epoch": 0.5779705688556995, + "grad_norm": 2.163421154022217, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7180659174919128, + "num_tokens": 133538597.0, + "step": 5263 + }, + { + "epoch": 0.5780803865583132, + "grad_norm": 2.3467862606048584, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7034728527069092, + "num_tokens": 133561176.0, + "step": 5264 + }, + { + "epoch": 0.5781902042609268, + "grad_norm": 2.6259756088256836, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.705681324005127, + "num_tokens": 133580523.0, + "step": 5265 + }, + { + "epoch": 0.5783000219635406, + "grad_norm": 2.1590824127197266, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.703009843826294, + "num_tokens": 133609222.0, + "step": 5266 + }, + { + "epoch": 0.5784098396661542, + "grad_norm": 2.326582670211792, + "learning_rate": 1e-06, + "loss": 1.1158, + "mean_token_accuracy": 0.6770869493484497, + "num_tokens": 133634987.0, + "step": 5267 + }, + { + "epoch": 0.5785196573687679, + "grad_norm": 1.9586158990859985, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.7044089436531067, + "num_tokens": 133667063.0, + "step": 5268 + }, + { + "epoch": 0.5786294750713815, + "grad_norm": 2.6178627014160156, + "learning_rate": 1e-06, + 
"loss": 1.0232, + "mean_token_accuracy": 0.6863992214202881, + "num_tokens": 133688101.0, + "step": 5269 + }, + { + "epoch": 0.5787392927739952, + "grad_norm": 2.3034298419952393, + "learning_rate": 1e-06, + "loss": 1.0702, + "mean_token_accuracy": 0.6824853420257568, + "num_tokens": 133711803.0, + "step": 5270 + }, + { + "epoch": 0.5788491104766088, + "grad_norm": 2.161839008331299, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.699673593044281, + "num_tokens": 133738718.0, + "step": 5271 + }, + { + "epoch": 0.5789589281792225, + "grad_norm": 2.2121665477752686, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7094592452049255, + "num_tokens": 133763709.0, + "step": 5272 + }, + { + "epoch": 0.5790687458818361, + "grad_norm": 2.0512263774871826, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.6999998092651367, + "num_tokens": 133791726.0, + "step": 5273 + }, + { + "epoch": 0.5791785635844499, + "grad_norm": 2.470515012741089, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.721735417842865, + "num_tokens": 133813223.0, + "step": 5274 + }, + { + "epoch": 0.5792883812870635, + "grad_norm": 2.3768038749694824, + "learning_rate": 1e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7304153442382812, + "num_tokens": 133834525.0, + "step": 5275 + }, + { + "epoch": 0.5793981989896771, + "grad_norm": 2.435854434967041, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7161340713500977, + "num_tokens": 133854818.0, + "step": 5276 + }, + { + "epoch": 0.5795080166922908, + "grad_norm": 2.3381118774414062, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.6939873695373535, + "num_tokens": 133878049.0, + "step": 5277 + }, + { + "epoch": 0.5796178343949044, + "grad_norm": 2.366504430770874, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7169451117515564, + "num_tokens": 133899991.0, + "step": 5278 + }, + { + "epoch": 
0.5797276520975181, + "grad_norm": 2.057523250579834, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.6929991245269775, + "num_tokens": 133928277.0, + "step": 5279 + }, + { + "epoch": 0.5798374698001317, + "grad_norm": 2.4262216091156006, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.700539231300354, + "num_tokens": 133951243.0, + "step": 5280 + }, + { + "epoch": 0.5799472875027455, + "grad_norm": 2.1406710147857666, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7084846496582031, + "num_tokens": 133977310.0, + "step": 5281 + }, + { + "epoch": 0.5800571052053591, + "grad_norm": 2.271547555923462, + "learning_rate": 1e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7275773286819458, + "num_tokens": 133999640.0, + "step": 5282 + }, + { + "epoch": 0.5801669229079728, + "grad_norm": 2.579116106033325, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.710374116897583, + "num_tokens": 134020033.0, + "step": 5283 + }, + { + "epoch": 0.5802767406105864, + "grad_norm": 2.2821857929229736, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7063003778457642, + "num_tokens": 134044272.0, + "step": 5284 + }, + { + "epoch": 0.5803865583132001, + "grad_norm": 2.176669120788574, + "learning_rate": 1e-06, + "loss": 1.0344, + "mean_token_accuracy": 0.6923031210899353, + "num_tokens": 134071481.0, + "step": 5285 + }, + { + "epoch": 0.5804963760158137, + "grad_norm": 2.1508359909057617, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7116300463676453, + "num_tokens": 134097117.0, + "step": 5286 + }, + { + "epoch": 0.5806061937184274, + "grad_norm": 2.2741692066192627, + "learning_rate": 1e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.6925932168960571, + "num_tokens": 134122507.0, + "step": 5287 + }, + { + "epoch": 0.5807160114210411, + "grad_norm": 2.11126708984375, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 
0.7017372846603394, + "num_tokens": 134149607.0, + "step": 5288 + }, + { + "epoch": 0.5808258291236548, + "grad_norm": 2.2705705165863037, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7279496788978577, + "num_tokens": 134173914.0, + "step": 5289 + }, + { + "epoch": 0.5809356468262684, + "grad_norm": 2.0476884841918945, + "learning_rate": 1e-06, + "loss": 1.0681, + "mean_token_accuracy": 0.6729558706283569, + "num_tokens": 134202540.0, + "step": 5290 + }, + { + "epoch": 0.5810454645288821, + "grad_norm": 2.103874683380127, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.6961624026298523, + "num_tokens": 134229716.0, + "step": 5291 + }, + { + "epoch": 0.5811552822314957, + "grad_norm": 2.0736753940582275, + "learning_rate": 1e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7325377464294434, + "num_tokens": 134256650.0, + "step": 5292 + }, + { + "epoch": 0.5812650999341094, + "grad_norm": 2.366046905517578, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7121015191078186, + "num_tokens": 134279509.0, + "step": 5293 + }, + { + "epoch": 0.581374917636723, + "grad_norm": 2.079604387283325, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7229758501052856, + "num_tokens": 134306192.0, + "step": 5294 + }, + { + "epoch": 0.5814847353393368, + "grad_norm": 2.1072893142700195, + "learning_rate": 1e-06, + "loss": 1.0313, + "mean_token_accuracy": 0.7024677395820618, + "num_tokens": 134335294.0, + "step": 5295 + }, + { + "epoch": 0.5815945530419504, + "grad_norm": 2.0885818004608154, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7026857733726501, + "num_tokens": 134361525.0, + "step": 5296 + }, + { + "epoch": 0.581704370744564, + "grad_norm": 2.1041512489318848, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7113806009292603, + "num_tokens": 134389692.0, + "step": 5297 + }, + { + "epoch": 0.5818141884471777, + "grad_norm": 
2.423011541366577, + "learning_rate": 1e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.7152559161186218, + "num_tokens": 134411581.0, + "step": 5298 + }, + { + "epoch": 0.5819240061497913, + "grad_norm": 2.2717249393463135, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.6947294473648071, + "num_tokens": 134437198.0, + "step": 5299 + }, + { + "epoch": 0.582033823852405, + "grad_norm": 2.264054775238037, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.6986645460128784, + "num_tokens": 134462045.0, + "step": 5300 + }, + { + "epoch": 0.5821436415550186, + "grad_norm": 1.9386764764785767, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7303282022476196, + "num_tokens": 134491150.0, + "step": 5301 + }, + { + "epoch": 0.5822534592576323, + "grad_norm": 2.0090019702911377, + "learning_rate": 1e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.6943742036819458, + "num_tokens": 134519820.0, + "step": 5302 + }, + { + "epoch": 0.582363276960246, + "grad_norm": 2.1604926586151123, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.6964572668075562, + "num_tokens": 134545661.0, + "step": 5303 + }, + { + "epoch": 0.5824730946628597, + "grad_norm": 2.121795892715454, + "learning_rate": 1e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.7288070321083069, + "num_tokens": 134571371.0, + "step": 5304 + }, + { + "epoch": 0.5825829123654733, + "grad_norm": 2.1336164474487305, + "learning_rate": 1e-06, + "loss": 1.068, + "mean_token_accuracy": 0.6853584051132202, + "num_tokens": 134599707.0, + "step": 5305 + }, + { + "epoch": 0.582692730068087, + "grad_norm": 2.165975570678711, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7060019969940186, + "num_tokens": 134624537.0, + "step": 5306 + }, + { + "epoch": 0.5828025477707006, + "grad_norm": 2.1158013343811035, + "learning_rate": 1e-06, + "loss": 1.0552, + "mean_token_accuracy": 0.6787488460540771, + "num_tokens": 
134651005.0, + "step": 5307 + }, + { + "epoch": 0.5829123654733143, + "grad_norm": 2.043452262878418, + "learning_rate": 1e-06, + "loss": 1.0848, + "mean_token_accuracy": 0.6748039722442627, + "num_tokens": 134681007.0, + "step": 5308 + }, + { + "epoch": 0.5830221831759279, + "grad_norm": 2.1336684226989746, + "learning_rate": 1e-06, + "loss": 1.0837, + "mean_token_accuracy": 0.6779297590255737, + "num_tokens": 134709497.0, + "step": 5309 + }, + { + "epoch": 0.5831320008785417, + "grad_norm": 2.4735143184661865, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7178666591644287, + "num_tokens": 134729215.0, + "step": 5310 + }, + { + "epoch": 0.5832418185811553, + "grad_norm": 2.1289377212524414, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7019673585891724, + "num_tokens": 134755564.0, + "step": 5311 + }, + { + "epoch": 0.583351636283769, + "grad_norm": 2.035707950592041, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7050561904907227, + "num_tokens": 134784573.0, + "step": 5312 + }, + { + "epoch": 0.5834614539863826, + "grad_norm": 2.361295223236084, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7076578736305237, + "num_tokens": 134807311.0, + "step": 5313 + }, + { + "epoch": 0.5835712716889963, + "grad_norm": 2.1787776947021484, + "learning_rate": 1e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.6998717784881592, + "num_tokens": 134833012.0, + "step": 5314 + }, + { + "epoch": 0.5836810893916099, + "grad_norm": 2.0514588356018066, + "learning_rate": 1e-06, + "loss": 1.0473, + "mean_token_accuracy": 0.6874218583106995, + "num_tokens": 134860961.0, + "step": 5315 + }, + { + "epoch": 0.5837909070942235, + "grad_norm": 2.546738862991333, + "learning_rate": 1e-06, + "loss": 1.0554, + "mean_token_accuracy": 0.6874744892120361, + "num_tokens": 134881393.0, + "step": 5316 + }, + { + "epoch": 0.5839007247968373, + "grad_norm": 2.373377561569214, + "learning_rate": 1e-06, + 
"loss": 0.9728, + "mean_token_accuracy": 0.7058731317520142, + "num_tokens": 134902656.0, + "step": 5317 + }, + { + "epoch": 0.584010542499451, + "grad_norm": 2.2139334678649902, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7048686742782593, + "num_tokens": 134927717.0, + "step": 5318 + }, + { + "epoch": 0.5841203602020646, + "grad_norm": 2.1015286445617676, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7229368686676025, + "num_tokens": 134953651.0, + "step": 5319 + }, + { + "epoch": 0.5842301779046782, + "grad_norm": 2.2112419605255127, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.7010471224784851, + "num_tokens": 134977183.0, + "step": 5320 + }, + { + "epoch": 0.5843399956072919, + "grad_norm": 2.059896230697632, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.697642982006073, + "num_tokens": 135005234.0, + "step": 5321 + }, + { + "epoch": 0.5844498133099055, + "grad_norm": 2.308072566986084, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.692697286605835, + "num_tokens": 135031078.0, + "step": 5322 + }, + { + "epoch": 0.5845596310125192, + "grad_norm": 1.81059730052948, + "learning_rate": 1e-06, + "loss": 1.074, + "mean_token_accuracy": 0.680100679397583, + "num_tokens": 135066512.0, + "step": 5323 + }, + { + "epoch": 0.5846694487151329, + "grad_norm": 1.9378100633621216, + "learning_rate": 1e-06, + "loss": 1.0353, + "mean_token_accuracy": 0.6864659190177917, + "num_tokens": 135098685.0, + "step": 5324 + }, + { + "epoch": 0.5847792664177466, + "grad_norm": 1.96377432346344, + "learning_rate": 1e-06, + "loss": 1.0399, + "mean_token_accuracy": 0.6919578313827515, + "num_tokens": 135130568.0, + "step": 5325 + }, + { + "epoch": 0.5848890841203602, + "grad_norm": 2.3124966621398926, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7147501111030579, + "num_tokens": 135152927.0, + "step": 5326 + }, + { + "epoch": 
0.5849989018229739, + "grad_norm": 2.414138078689575, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7131749391555786, + "num_tokens": 135175186.0, + "step": 5327 + }, + { + "epoch": 0.5851087195255875, + "grad_norm": 2.268028736114502, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7095658779144287, + "num_tokens": 135198916.0, + "step": 5328 + }, + { + "epoch": 0.5852185372282012, + "grad_norm": 2.4007551670074463, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7262096405029297, + "num_tokens": 135219459.0, + "step": 5329 + }, + { + "epoch": 0.5853283549308148, + "grad_norm": 2.4372851848602295, + "learning_rate": 1e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.6864607930183411, + "num_tokens": 135242312.0, + "step": 5330 + }, + { + "epoch": 0.5854381726334285, + "grad_norm": 2.248338460922241, + "learning_rate": 1e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7259669303894043, + "num_tokens": 135265193.0, + "step": 5331 + }, + { + "epoch": 0.5855479903360422, + "grad_norm": 2.2977545261383057, + "learning_rate": 1e-06, + "loss": 0.8649, + "mean_token_accuracy": 0.7360988259315491, + "num_tokens": 135288659.0, + "step": 5332 + }, + { + "epoch": 0.5856578080386559, + "grad_norm": 2.0379018783569336, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7024803161621094, + "num_tokens": 135316940.0, + "step": 5333 + }, + { + "epoch": 0.5857676257412695, + "grad_norm": 2.444532632827759, + "learning_rate": 1e-06, + "loss": 0.8516, + "mean_token_accuracy": 0.7294316291809082, + "num_tokens": 135336231.0, + "step": 5334 + }, + { + "epoch": 0.5858774434438832, + "grad_norm": 2.2444467544555664, + "learning_rate": 1e-06, + "loss": 1.0314, + "mean_token_accuracy": 0.69367516040802, + "num_tokens": 135363309.0, + "step": 5335 + }, + { + "epoch": 0.5859872611464968, + "grad_norm": 2.3661410808563232, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 
0.6952400803565979, + "num_tokens": 135386888.0, + "step": 5336 + }, + { + "epoch": 0.5860970788491104, + "grad_norm": 2.015108108520508, + "learning_rate": 1e-06, + "loss": 1.0483, + "mean_token_accuracy": 0.678848147392273, + "num_tokens": 135417378.0, + "step": 5337 + }, + { + "epoch": 0.5862068965517241, + "grad_norm": 2.3119020462036133, + "learning_rate": 1e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.7328635454177856, + "num_tokens": 135440079.0, + "step": 5338 + }, + { + "epoch": 0.5863167142543378, + "grad_norm": 2.184389352798462, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.6977382302284241, + "num_tokens": 135467148.0, + "step": 5339 + }, + { + "epoch": 0.5864265319569515, + "grad_norm": 1.9321072101593018, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7064551115036011, + "num_tokens": 135496381.0, + "step": 5340 + }, + { + "epoch": 0.5865363496595651, + "grad_norm": 2.2017407417297363, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7036951780319214, + "num_tokens": 135522143.0, + "step": 5341 + }, + { + "epoch": 0.5866461673621788, + "grad_norm": 2.3259363174438477, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.6888774633407593, + "num_tokens": 135544666.0, + "step": 5342 + }, + { + "epoch": 0.5867559850647924, + "grad_norm": 2.103386163711548, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.6995185613632202, + "num_tokens": 135573534.0, + "step": 5343 + }, + { + "epoch": 0.5868658027674061, + "grad_norm": 2.010856866836548, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.6928291320800781, + "num_tokens": 135604791.0, + "step": 5344 + }, + { + "epoch": 0.5869756204700197, + "grad_norm": 2.170180320739746, + "learning_rate": 1e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.7343120574951172, + "num_tokens": 135629623.0, + "step": 5345 + }, + { + "epoch": 0.5870854381726335, + "grad_norm": 
2.4939627647399902, + "learning_rate": 1e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.7282553315162659, + "num_tokens": 135648881.0, + "step": 5346 + }, + { + "epoch": 0.5871952558752471, + "grad_norm": 2.45862078666687, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7079036235809326, + "num_tokens": 135670903.0, + "step": 5347 + }, + { + "epoch": 0.5873050735778608, + "grad_norm": 2.0855026245117188, + "learning_rate": 1e-06, + "loss": 1.077, + "mean_token_accuracy": 0.677950382232666, + "num_tokens": 135701610.0, + "step": 5348 + }, + { + "epoch": 0.5874148912804744, + "grad_norm": 2.0358619689941406, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.69654381275177, + "num_tokens": 135730707.0, + "step": 5349 + }, + { + "epoch": 0.5875247089830881, + "grad_norm": 2.045050859451294, + "learning_rate": 1e-06, + "loss": 1.1104, + "mean_token_accuracy": 0.666887640953064, + "num_tokens": 135760981.0, + "step": 5350 + }, + { + "epoch": 0.5876345266857017, + "grad_norm": 2.056560754776001, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7133487462997437, + "num_tokens": 135789303.0, + "step": 5351 + }, + { + "epoch": 0.5877443443883154, + "grad_norm": 2.32171893119812, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7242007255554199, + "num_tokens": 135810776.0, + "step": 5352 + }, + { + "epoch": 0.5878541620909291, + "grad_norm": 2.1379990577697754, + "learning_rate": 1e-06, + "loss": 1.0903, + "mean_token_accuracy": 0.6929370164871216, + "num_tokens": 135836668.0, + "step": 5353 + }, + { + "epoch": 0.5879639797935428, + "grad_norm": 2.090125322341919, + "learning_rate": 1e-06, + "loss": 1.0708, + "mean_token_accuracy": 0.6758311986923218, + "num_tokens": 135864736.0, + "step": 5354 + }, + { + "epoch": 0.5880737974961564, + "grad_norm": 2.15268611907959, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.703617513179779, + "num_tokens": 135890029.0, + 
"step": 5355 + }, + { + "epoch": 0.58818361519877, + "grad_norm": 2.1933631896972656, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.6965094804763794, + "num_tokens": 135914682.0, + "step": 5356 + }, + { + "epoch": 0.5882934329013837, + "grad_norm": 2.59837007522583, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7193707823753357, + "num_tokens": 135933958.0, + "step": 5357 + }, + { + "epoch": 0.5884032506039973, + "grad_norm": 2.3280839920043945, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.7041277289390564, + "num_tokens": 135958154.0, + "step": 5358 + }, + { + "epoch": 0.588513068306611, + "grad_norm": 2.580996036529541, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7054991722106934, + "num_tokens": 135978503.0, + "step": 5359 + }, + { + "epoch": 0.5886228860092246, + "grad_norm": 2.137294054031372, + "learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.6962557435035706, + "num_tokens": 136006603.0, + "step": 5360 + }, + { + "epoch": 0.5887327037118384, + "grad_norm": 2.040471315383911, + "learning_rate": 1e-06, + "loss": 0.8228, + "mean_token_accuracy": 0.7472839951515198, + "num_tokens": 136033490.0, + "step": 5361 + }, + { + "epoch": 0.588842521414452, + "grad_norm": 2.363517999649048, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7026844024658203, + "num_tokens": 136055274.0, + "step": 5362 + }, + { + "epoch": 0.5889523391170657, + "grad_norm": 2.088432788848877, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7146603465080261, + "num_tokens": 136081709.0, + "step": 5363 + }, + { + "epoch": 0.5890621568196793, + "grad_norm": 2.3725972175598145, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7201924324035645, + "num_tokens": 136101410.0, + "step": 5364 + }, + { + "epoch": 0.589171974522293, + "grad_norm": 2.3790218830108643, + "learning_rate": 1e-06, + "loss": 0.8639, + 
"mean_token_accuracy": 0.727538526058197, + "num_tokens": 136122588.0, + "step": 5365 + }, + { + "epoch": 0.5892817922249066, + "grad_norm": 2.384206771850586, + "learning_rate": 1e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7382428646087646, + "num_tokens": 136143878.0, + "step": 5366 + }, + { + "epoch": 0.5893916099275203, + "grad_norm": 2.5817954540252686, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7106072306632996, + "num_tokens": 136163020.0, + "step": 5367 + }, + { + "epoch": 0.589501427630134, + "grad_norm": 2.207455635070801, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.712073802947998, + "num_tokens": 136188276.0, + "step": 5368 + }, + { + "epoch": 0.5896112453327477, + "grad_norm": 2.2021701335906982, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.6915698647499084, + "num_tokens": 136213333.0, + "step": 5369 + }, + { + "epoch": 0.5897210630353613, + "grad_norm": 2.537984848022461, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7141621112823486, + "num_tokens": 136233361.0, + "step": 5370 + }, + { + "epoch": 0.589830880737975, + "grad_norm": 2.031512975692749, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.7033877372741699, + "num_tokens": 136262337.0, + "step": 5371 + }, + { + "epoch": 0.5899406984405886, + "grad_norm": 2.920125722885132, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7257758378982544, + "num_tokens": 136277290.0, + "step": 5372 + }, + { + "epoch": 0.5900505161432023, + "grad_norm": 2.0800065994262695, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.697900116443634, + "num_tokens": 136307146.0, + "step": 5373 + }, + { + "epoch": 0.5901603338458159, + "grad_norm": 2.0810067653656006, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7247147560119629, + "num_tokens": 136333239.0, + "step": 5374 + }, + { + "epoch": 0.5902701515484297, + 
"grad_norm": 2.1578404903411865, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.6960090398788452, + "num_tokens": 136361282.0, + "step": 5375 + }, + { + "epoch": 0.5903799692510433, + "grad_norm": 2.320922374725342, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.6964601278305054, + "num_tokens": 136387075.0, + "step": 5376 + }, + { + "epoch": 0.590489786953657, + "grad_norm": 2.035386085510254, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7072829008102417, + "num_tokens": 136414477.0, + "step": 5377 + }, + { + "epoch": 0.5905996046562706, + "grad_norm": 2.101747512817383, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7047562003135681, + "num_tokens": 136441797.0, + "step": 5378 + }, + { + "epoch": 0.5907094223588842, + "grad_norm": 2.1089677810668945, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.7013516426086426, + "num_tokens": 136469533.0, + "step": 5379 + }, + { + "epoch": 0.5908192400614979, + "grad_norm": 2.0546233654022217, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7143845558166504, + "num_tokens": 136498634.0, + "step": 5380 + }, + { + "epoch": 0.5909290577641115, + "grad_norm": 1.8833364248275757, + "learning_rate": 1e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.7026697993278503, + "num_tokens": 136530473.0, + "step": 5381 + }, + { + "epoch": 0.5910388754667253, + "grad_norm": 2.1236236095428467, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.7004296183586121, + "num_tokens": 136557994.0, + "step": 5382 + }, + { + "epoch": 0.5911486931693389, + "grad_norm": 2.177748441696167, + "learning_rate": 1e-06, + "loss": 1.0136, + "mean_token_accuracy": 0.6950584650039673, + "num_tokens": 136585075.0, + "step": 5383 + }, + { + "epoch": 0.5912585108719526, + "grad_norm": 2.399397850036621, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7101090550422668, + 
"num_tokens": 136606122.0, + "step": 5384 + }, + { + "epoch": 0.5913683285745662, + "grad_norm": 2.1243114471435547, + "learning_rate": 1e-06, + "loss": 1.0435, + "mean_token_accuracy": 0.6847535371780396, + "num_tokens": 136635094.0, + "step": 5385 + }, + { + "epoch": 0.5914781462771799, + "grad_norm": 2.173292398452759, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.6966434717178345, + "num_tokens": 136661888.0, + "step": 5386 + }, + { + "epoch": 0.5915879639797935, + "grad_norm": 2.2553887367248535, + "learning_rate": 1e-06, + "loss": 1.0556, + "mean_token_accuracy": 0.6813557744026184, + "num_tokens": 136685289.0, + "step": 5387 + }, + { + "epoch": 0.5916977816824072, + "grad_norm": 2.127753496170044, + "learning_rate": 1e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.6963452696800232, + "num_tokens": 136712569.0, + "step": 5388 + }, + { + "epoch": 0.5918075993850208, + "grad_norm": 2.3257339000701904, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.6965614557266235, + "num_tokens": 136736390.0, + "step": 5389 + }, + { + "epoch": 0.5919174170876346, + "grad_norm": 2.4127278327941895, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.6990807056427002, + "num_tokens": 136758149.0, + "step": 5390 + }, + { + "epoch": 0.5920272347902482, + "grad_norm": 2.1937789916992188, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7157862782478333, + "num_tokens": 136783103.0, + "step": 5391 + }, + { + "epoch": 0.5921370524928619, + "grad_norm": 2.1251790523529053, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7102969884872437, + "num_tokens": 136809917.0, + "step": 5392 + }, + { + "epoch": 0.5922468701954755, + "grad_norm": 1.9215549230575562, + "learning_rate": 1e-06, + "loss": 1.0455, + "mean_token_accuracy": 0.6864724159240723, + "num_tokens": 136842358.0, + "step": 5393 + }, + { + "epoch": 0.5923566878980892, + "grad_norm": 2.115208148956299, + 
"learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7021274566650391, + "num_tokens": 136868134.0, + "step": 5394 + }, + { + "epoch": 0.5924665056007028, + "grad_norm": 2.0928351879119873, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7083809971809387, + "num_tokens": 136896357.0, + "step": 5395 + }, + { + "epoch": 0.5925763233033164, + "grad_norm": 2.4831888675689697, + "learning_rate": 1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.6932494640350342, + "num_tokens": 136918221.0, + "step": 5396 + }, + { + "epoch": 0.5926861410059302, + "grad_norm": 2.35025954246521, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7166370153427124, + "num_tokens": 136941443.0, + "step": 5397 + }, + { + "epoch": 0.5927959587085438, + "grad_norm": 2.1858882904052734, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.6872754693031311, + "num_tokens": 136966913.0, + "step": 5398 + }, + { + "epoch": 0.5929057764111575, + "grad_norm": 2.1280953884124756, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7118860483169556, + "num_tokens": 136993862.0, + "step": 5399 + }, + { + "epoch": 0.5930155941137711, + "grad_norm": 2.4167895317077637, + "learning_rate": 1e-06, + "loss": 1.0417, + "mean_token_accuracy": 0.6910537481307983, + "num_tokens": 137015019.0, + "step": 5400 + }, + { + "epoch": 0.5931254118163848, + "grad_norm": 2.367708444595337, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7191559076309204, + "num_tokens": 137037953.0, + "step": 5401 + }, + { + "epoch": 0.5932352295189984, + "grad_norm": 2.0849738121032715, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7068837881088257, + "num_tokens": 137064933.0, + "step": 5402 + }, + { + "epoch": 0.5933450472216121, + "grad_norm": 2.4071781635284424, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7141643762588501, + "num_tokens": 137086728.0, + "step": 5403 + 
}, + { + "epoch": 0.5934548649242258, + "grad_norm": 2.0097851753234863, + "learning_rate": 1e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7212477326393127, + "num_tokens": 137116442.0, + "step": 5404 + }, + { + "epoch": 0.5935646826268395, + "grad_norm": 2.1861157417297363, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.6992066502571106, + "num_tokens": 137143380.0, + "step": 5405 + }, + { + "epoch": 0.5936745003294531, + "grad_norm": 2.39678692817688, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.7044632434844971, + "num_tokens": 137165940.0, + "step": 5406 + }, + { + "epoch": 0.5937843180320668, + "grad_norm": 2.3236138820648193, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7197256684303284, + "num_tokens": 137188489.0, + "step": 5407 + }, + { + "epoch": 0.5938941357346804, + "grad_norm": 2.2247555255889893, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7083393335342407, + "num_tokens": 137212371.0, + "step": 5408 + }, + { + "epoch": 0.5940039534372941, + "grad_norm": 2.075685739517212, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7014939785003662, + "num_tokens": 137239499.0, + "step": 5409 + }, + { + "epoch": 0.5941137711399077, + "grad_norm": 2.2490665912628174, + "learning_rate": 1e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6814083456993103, + "num_tokens": 137263637.0, + "step": 5410 + }, + { + "epoch": 0.5942235888425215, + "grad_norm": 2.470824718475342, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.7237462997436523, + "num_tokens": 137281674.0, + "step": 5411 + }, + { + "epoch": 0.5943334065451351, + "grad_norm": 2.0058093070983887, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7081511616706848, + "num_tokens": 137308826.0, + "step": 5412 + }, + { + "epoch": 0.5944432242477488, + "grad_norm": 2.0958633422851562, + "learning_rate": 1e-06, + "loss": 1.0389, + 
"mean_token_accuracy": 0.6870083212852478, + "num_tokens": 137336346.0, + "step": 5413 + }, + { + "epoch": 0.5945530419503624, + "grad_norm": 1.9868119955062866, + "learning_rate": 1e-06, + "loss": 1.0371, + "mean_token_accuracy": 0.6911208629608154, + "num_tokens": 137366861.0, + "step": 5414 + }, + { + "epoch": 0.594662859652976, + "grad_norm": 2.313311815261841, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7132450342178345, + "num_tokens": 137389975.0, + "step": 5415 + }, + { + "epoch": 0.5947726773555897, + "grad_norm": 2.37825083732605, + "learning_rate": 1e-06, + "loss": 1.0558, + "mean_token_accuracy": 0.6925934553146362, + "num_tokens": 137414425.0, + "step": 5416 + }, + { + "epoch": 0.5948824950582033, + "grad_norm": 2.080374240875244, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7179035544395447, + "num_tokens": 137442599.0, + "step": 5417 + }, + { + "epoch": 0.5949923127608171, + "grad_norm": 2.262350559234619, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7076404094696045, + "num_tokens": 137465892.0, + "step": 5418 + }, + { + "epoch": 0.5951021304634307, + "grad_norm": 1.9733366966247559, + "learning_rate": 1e-06, + "loss": 0.8529, + "mean_token_accuracy": 0.7398936152458191, + "num_tokens": 137495131.0, + "step": 5419 + }, + { + "epoch": 0.5952119481660444, + "grad_norm": 2.1121695041656494, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7092857360839844, + "num_tokens": 137521050.0, + "step": 5420 + }, + { + "epoch": 0.595321765868658, + "grad_norm": 2.1169726848602295, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7008150219917297, + "num_tokens": 137550602.0, + "step": 5421 + }, + { + "epoch": 0.5954315835712717, + "grad_norm": 2.252828359603882, + "learning_rate": 1e-06, + "loss": 1.082, + "mean_token_accuracy": 0.6840410828590393, + "num_tokens": 137575900.0, + "step": 5422 + }, + { + "epoch": 0.5955414012738853, + 
"grad_norm": 2.550611734390259, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.7001838684082031, + "num_tokens": 137595586.0, + "step": 5423 + }, + { + "epoch": 0.595651218976499, + "grad_norm": 2.212613344192505, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7230746746063232, + "num_tokens": 137620226.0, + "step": 5424 + }, + { + "epoch": 0.5957610366791126, + "grad_norm": 2.4549190998077393, + "learning_rate": 1e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.6936026811599731, + "num_tokens": 137641679.0, + "step": 5425 + }, + { + "epoch": 0.5958708543817264, + "grad_norm": 2.0559322834014893, + "learning_rate": 1e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.6947462558746338, + "num_tokens": 137670119.0, + "step": 5426 + }, + { + "epoch": 0.59598067208434, + "grad_norm": 2.2174909114837646, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7128438949584961, + "num_tokens": 137694471.0, + "step": 5427 + }, + { + "epoch": 0.5960904897869537, + "grad_norm": 2.1337547302246094, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7048957347869873, + "num_tokens": 137723577.0, + "step": 5428 + }, + { + "epoch": 0.5962003074895673, + "grad_norm": 2.09616756439209, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.6960850954055786, + "num_tokens": 137751540.0, + "step": 5429 + }, + { + "epoch": 0.596310125192181, + "grad_norm": 2.439682960510254, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7131720185279846, + "num_tokens": 137774288.0, + "step": 5430 + }, + { + "epoch": 0.5964199428947946, + "grad_norm": 2.3717894554138184, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7046018242835999, + "num_tokens": 137796385.0, + "step": 5431 + }, + { + "epoch": 0.5965297605974083, + "grad_norm": 2.1455440521240234, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7199028134346008, + 
"num_tokens": 137824561.0, + "step": 5432 + }, + { + "epoch": 0.596639578300022, + "grad_norm": 2.2519495487213135, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.6997621059417725, + "num_tokens": 137849474.0, + "step": 5433 + }, + { + "epoch": 0.5967493960026357, + "grad_norm": 2.1378211975097656, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7154555320739746, + "num_tokens": 137875982.0, + "step": 5434 + }, + { + "epoch": 0.5968592137052493, + "grad_norm": 1.9987905025482178, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7236908674240112, + "num_tokens": 137902499.0, + "step": 5435 + }, + { + "epoch": 0.596969031407863, + "grad_norm": 2.3171772956848145, + "learning_rate": 1e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.6848335266113281, + "num_tokens": 137927921.0, + "step": 5436 + }, + { + "epoch": 0.5970788491104766, + "grad_norm": 2.2726173400878906, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7106627225875854, + "num_tokens": 137951993.0, + "step": 5437 + }, + { + "epoch": 0.5971886668130902, + "grad_norm": 2.1888816356658936, + "learning_rate": 1e-06, + "loss": 1.024, + "mean_token_accuracy": 0.6960647702217102, + "num_tokens": 137977462.0, + "step": 5438 + }, + { + "epoch": 0.5972984845157039, + "grad_norm": 2.204746723175049, + "learning_rate": 1e-06, + "loss": 1.1154, + "mean_token_accuracy": 0.6654497385025024, + "num_tokens": 138005029.0, + "step": 5439 + }, + { + "epoch": 0.5974083022183176, + "grad_norm": 2.2302119731903076, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.6974286437034607, + "num_tokens": 138030439.0, + "step": 5440 + }, + { + "epoch": 0.5975181199209313, + "grad_norm": 2.1454660892486572, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7074365615844727, + "num_tokens": 138056217.0, + "step": 5441 + }, + { + "epoch": 0.5976279376235449, + "grad_norm": 2.288992404937744, + 
"learning_rate": 1e-06, + "loss": 1.0427, + "mean_token_accuracy": 0.6881586909294128, + "num_tokens": 138080162.0, + "step": 5442 + }, + { + "epoch": 0.5977377553261586, + "grad_norm": 2.0697314739227295, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.696535587310791, + "num_tokens": 138106497.0, + "step": 5443 + }, + { + "epoch": 0.5978475730287722, + "grad_norm": 2.373782157897949, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.702292799949646, + "num_tokens": 138128711.0, + "step": 5444 + }, + { + "epoch": 0.5979573907313859, + "grad_norm": 2.2222931385040283, + "learning_rate": 1e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.6853017210960388, + "num_tokens": 138154319.0, + "step": 5445 + }, + { + "epoch": 0.5980672084339995, + "grad_norm": 1.9221608638763428, + "learning_rate": 1e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.6906640529632568, + "num_tokens": 138188196.0, + "step": 5446 + }, + { + "epoch": 0.5981770261366133, + "grad_norm": 2.2308223247528076, + "learning_rate": 1e-06, + "loss": 1.0456, + "mean_token_accuracy": 0.6779054403305054, + "num_tokens": 138213470.0, + "step": 5447 + }, + { + "epoch": 0.5982868438392269, + "grad_norm": 2.395636558532715, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7022042870521545, + "num_tokens": 138234551.0, + "step": 5448 + }, + { + "epoch": 0.5983966615418406, + "grad_norm": 2.1282596588134766, + "learning_rate": 1e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.6924141645431519, + "num_tokens": 138263182.0, + "step": 5449 + }, + { + "epoch": 0.5985064792444542, + "grad_norm": 2.4755492210388184, + "learning_rate": 1e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.7150671482086182, + "num_tokens": 138283699.0, + "step": 5450 + }, + { + "epoch": 0.5986162969470679, + "grad_norm": 2.453575372695923, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.6963980793952942, + "num_tokens": 138304510.0, + "step": 5451 + 
}, + { + "epoch": 0.5987261146496815, + "grad_norm": 2.404585838317871, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7198631167411804, + "num_tokens": 138324993.0, + "step": 5452 + }, + { + "epoch": 0.5988359323522952, + "grad_norm": 2.0421743392944336, + "learning_rate": 1e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.6987335085868835, + "num_tokens": 138356558.0, + "step": 5453 + }, + { + "epoch": 0.5989457500549088, + "grad_norm": 1.9278944730758667, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.69748455286026, + "num_tokens": 138387095.0, + "step": 5454 + }, + { + "epoch": 0.5990555677575226, + "grad_norm": 2.1350791454315186, + "learning_rate": 1e-06, + "loss": 1.0394, + "mean_token_accuracy": 0.6889566779136658, + "num_tokens": 138414375.0, + "step": 5455 + }, + { + "epoch": 0.5991653854601362, + "grad_norm": 2.102241277694702, + "learning_rate": 1e-06, + "loss": 1.0742, + "mean_token_accuracy": 0.6801394820213318, + "num_tokens": 138441682.0, + "step": 5456 + }, + { + "epoch": 0.5992752031627498, + "grad_norm": 2.0317399501800537, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7091337442398071, + "num_tokens": 138469691.0, + "step": 5457 + }, + { + "epoch": 0.5993850208653635, + "grad_norm": 2.3962674140930176, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7052939534187317, + "num_tokens": 138492524.0, + "step": 5458 + }, + { + "epoch": 0.5994948385679771, + "grad_norm": 2.0075011253356934, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.7011629343032837, + "num_tokens": 138521868.0, + "step": 5459 + }, + { + "epoch": 0.5996046562705908, + "grad_norm": 2.136915683746338, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7259189486503601, + "num_tokens": 138546120.0, + "step": 5460 + }, + { + "epoch": 0.5997144739732044, + "grad_norm": 1.9182075262069702, + "learning_rate": 1e-06, + "loss": 0.9789, + 
"mean_token_accuracy": 0.699013888835907, + "num_tokens": 138576958.0, + "step": 5461 + }, + { + "epoch": 0.5998242916758182, + "grad_norm": 2.2569456100463867, + "learning_rate": 1e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.688766598701477, + "num_tokens": 138604805.0, + "step": 5462 + }, + { + "epoch": 0.5999341093784318, + "grad_norm": 2.466883420944214, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7243133187294006, + "num_tokens": 138626282.0, + "step": 5463 + }, + { + "epoch": 0.6000439270810455, + "grad_norm": 2.3731110095977783, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.7025585174560547, + "num_tokens": 138647693.0, + "step": 5464 + }, + { + "epoch": 0.6001537447836591, + "grad_norm": 2.367459535598755, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.6940805315971375, + "num_tokens": 138672699.0, + "step": 5465 + }, + { + "epoch": 0.6002635624862728, + "grad_norm": 2.1157467365264893, + "learning_rate": 1e-06, + "loss": 0.8829, + "mean_token_accuracy": 0.7245572805404663, + "num_tokens": 138699317.0, + "step": 5466 + }, + { + "epoch": 0.6003733801888864, + "grad_norm": 1.964430332183838, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7266138792037964, + "num_tokens": 138730711.0, + "step": 5467 + }, + { + "epoch": 0.6004831978915001, + "grad_norm": 2.5505168437957764, + "learning_rate": 1e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.7231582999229431, + "num_tokens": 138750134.0, + "step": 5468 + }, + { + "epoch": 0.6005930155941138, + "grad_norm": 2.346113443374634, + "learning_rate": 1e-06, + "loss": 1.0294, + "mean_token_accuracy": 0.6866223812103271, + "num_tokens": 138775225.0, + "step": 5469 + }, + { + "epoch": 0.6007028332967275, + "grad_norm": 2.099989414215088, + "learning_rate": 1e-06, + "loss": 1.0937, + "mean_token_accuracy": 0.6735120415687561, + "num_tokens": 138802399.0, + "step": 5470 + }, + { + "epoch": 0.6008126509993411, + 
"grad_norm": 2.3448245525360107, + "learning_rate": 1e-06, + "loss": 1.0366, + "mean_token_accuracy": 0.6990779042243958, + "num_tokens": 138826413.0, + "step": 5471 + }, + { + "epoch": 0.6009224687019548, + "grad_norm": 2.1126415729522705, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7112410664558411, + "num_tokens": 138852336.0, + "step": 5472 + }, + { + "epoch": 0.6010322864045684, + "grad_norm": 2.3128035068511963, + "learning_rate": 1e-06, + "loss": 1.0596, + "mean_token_accuracy": 0.6826804876327515, + "num_tokens": 138879202.0, + "step": 5473 + }, + { + "epoch": 0.601142104107182, + "grad_norm": 2.0981290340423584, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7107470631599426, + "num_tokens": 138904377.0, + "step": 5474 + }, + { + "epoch": 0.6012519218097957, + "grad_norm": 2.427039623260498, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.6962885856628418, + "num_tokens": 138926348.0, + "step": 5475 + }, + { + "epoch": 0.6013617395124095, + "grad_norm": 2.456597089767456, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7154520750045776, + "num_tokens": 138947655.0, + "step": 5476 + }, + { + "epoch": 0.6014715572150231, + "grad_norm": 2.3231682777404785, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.714522659778595, + "num_tokens": 138970591.0, + "step": 5477 + }, + { + "epoch": 0.6015813749176367, + "grad_norm": 2.2872965335845947, + "learning_rate": 1e-06, + "loss": 1.0562, + "mean_token_accuracy": 0.681661069393158, + "num_tokens": 138993471.0, + "step": 5478 + }, + { + "epoch": 0.6016911926202504, + "grad_norm": 2.0595693588256836, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.6934850215911865, + "num_tokens": 139022415.0, + "step": 5479 + }, + { + "epoch": 0.601801010322864, + "grad_norm": 2.3935234546661377, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.6915416717529297, + 
"num_tokens": 139043452.0, + "step": 5480 + }, + { + "epoch": 0.6019108280254777, + "grad_norm": 2.180154323577881, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.711414635181427, + "num_tokens": 139068103.0, + "step": 5481 + }, + { + "epoch": 0.6020206457280913, + "grad_norm": 2.3115463256835938, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7184445858001709, + "num_tokens": 139090087.0, + "step": 5482 + }, + { + "epoch": 0.602130463430705, + "grad_norm": 2.48283052444458, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7146414518356323, + "num_tokens": 139111204.0, + "step": 5483 + }, + { + "epoch": 0.6022402811333187, + "grad_norm": 2.8751351833343506, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7280705571174622, + "num_tokens": 139127009.0, + "step": 5484 + }, + { + "epoch": 0.6023500988359324, + "grad_norm": 2.3137075901031494, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7056063413619995, + "num_tokens": 139152060.0, + "step": 5485 + }, + { + "epoch": 0.602459916538546, + "grad_norm": 2.392059087753296, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7179515957832336, + "num_tokens": 139173276.0, + "step": 5486 + }, + { + "epoch": 0.6025697342411597, + "grad_norm": 2.3307340145111084, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.715127170085907, + "num_tokens": 139195341.0, + "step": 5487 + }, + { + "epoch": 0.6026795519437733, + "grad_norm": 2.6396286487579346, + "learning_rate": 1e-06, + "loss": 0.8437, + "mean_token_accuracy": 0.7409960031509399, + "num_tokens": 139213357.0, + "step": 5488 + }, + { + "epoch": 0.602789369646387, + "grad_norm": 2.2698259353637695, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.70569908618927, + "num_tokens": 139237504.0, + "step": 5489 + }, + { + "epoch": 0.6028991873490006, + "grad_norm": 2.3311355113983154, + "learning_rate": 
1e-06, + "loss": 1.0758, + "mean_token_accuracy": 0.6831043362617493, + "num_tokens": 139265085.0, + "step": 5490 + }, + { + "epoch": 0.6030090050516144, + "grad_norm": 2.389822006225586, + "learning_rate": 1e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.732464075088501, + "num_tokens": 139286359.0, + "step": 5491 + }, + { + "epoch": 0.603118822754228, + "grad_norm": 2.1650314331054688, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.706295371055603, + "num_tokens": 139312442.0, + "step": 5492 + }, + { + "epoch": 0.6032286404568417, + "grad_norm": 2.2491960525512695, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7070748209953308, + "num_tokens": 139336185.0, + "step": 5493 + }, + { + "epoch": 0.6033384581594553, + "grad_norm": 2.111419916152954, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.718921422958374, + "num_tokens": 139363947.0, + "step": 5494 + }, + { + "epoch": 0.603448275862069, + "grad_norm": 1.9681684970855713, + "learning_rate": 1e-06, + "loss": 1.0784, + "mean_token_accuracy": 0.6849911212921143, + "num_tokens": 139396647.0, + "step": 5495 + }, + { + "epoch": 0.6035580935646826, + "grad_norm": 2.1911416053771973, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.6863493919372559, + "num_tokens": 139422013.0, + "step": 5496 + }, + { + "epoch": 0.6036679112672962, + "grad_norm": 2.2566781044006348, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7247757315635681, + "num_tokens": 139444769.0, + "step": 5497 + }, + { + "epoch": 0.60377772896991, + "grad_norm": 2.403796434402466, + "learning_rate": 1e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7224408388137817, + "num_tokens": 139465644.0, + "step": 5498 + }, + { + "epoch": 0.6038875466725236, + "grad_norm": 2.1763882637023926, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7012078166007996, + "num_tokens": 139491797.0, + "step": 5499 + }, + { + "epoch": 
0.6039973643751373, + "grad_norm": 2.108274459838867, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.6887596845626831, + "num_tokens": 139517904.0, + "step": 5500 + }, + { + "epoch": 0.6041071820777509, + "grad_norm": 2.0259740352630615, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7121018171310425, + "num_tokens": 139544756.0, + "step": 5501 + }, + { + "epoch": 0.6042169997803646, + "grad_norm": 1.9550445079803467, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7123164534568787, + "num_tokens": 139573475.0, + "step": 5502 + }, + { + "epoch": 0.6043268174829782, + "grad_norm": 2.239844799041748, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7175184488296509, + "num_tokens": 139597332.0, + "step": 5503 + }, + { + "epoch": 0.6044366351855919, + "grad_norm": 2.319993257522583, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7058566808700562, + "num_tokens": 139620300.0, + "step": 5504 + }, + { + "epoch": 0.6045464528882056, + "grad_norm": 1.9593117237091064, + "learning_rate": 1e-06, + "loss": 1.0277, + "mean_token_accuracy": 0.6908175945281982, + "num_tokens": 139654269.0, + "step": 5505 + }, + { + "epoch": 0.6046562705908193, + "grad_norm": 2.2460691928863525, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.7002675533294678, + "num_tokens": 139680687.0, + "step": 5506 + }, + { + "epoch": 0.6047660882934329, + "grad_norm": 2.120131492614746, + "learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.7038993835449219, + "num_tokens": 139707622.0, + "step": 5507 + }, + { + "epoch": 0.6048759059960466, + "grad_norm": 2.2088558673858643, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7194795608520508, + "num_tokens": 139732626.0, + "step": 5508 + }, + { + "epoch": 0.6049857236986602, + "grad_norm": 2.2346272468566895, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 
0.716023325920105, + "num_tokens": 139756802.0, + "step": 5509 + }, + { + "epoch": 0.6050955414012739, + "grad_norm": 2.086853265762329, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.6962833404541016, + "num_tokens": 139784555.0, + "step": 5510 + }, + { + "epoch": 0.6052053591038875, + "grad_norm": 2.0035006999969482, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7242966890335083, + "num_tokens": 139814473.0, + "step": 5511 + }, + { + "epoch": 0.6053151768065012, + "grad_norm": 2.328758716583252, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7078355550765991, + "num_tokens": 139835788.0, + "step": 5512 + }, + { + "epoch": 0.6054249945091149, + "grad_norm": 2.10791015625, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7004088163375854, + "num_tokens": 139864479.0, + "step": 5513 + }, + { + "epoch": 0.6055348122117286, + "grad_norm": 2.5662193298339844, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7117232084274292, + "num_tokens": 139884742.0, + "step": 5514 + }, + { + "epoch": 0.6056446299143422, + "grad_norm": 2.1193225383758545, + "learning_rate": 1e-06, + "loss": 1.0282, + "mean_token_accuracy": 0.6855388879776001, + "num_tokens": 139913925.0, + "step": 5515 + }, + { + "epoch": 0.6057544476169558, + "grad_norm": 2.4391586780548096, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7116345167160034, + "num_tokens": 139934992.0, + "step": 5516 + }, + { + "epoch": 0.6058642653195695, + "grad_norm": 2.1527483463287354, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7017678022384644, + "num_tokens": 139962074.0, + "step": 5517 + }, + { + "epoch": 0.6059740830221831, + "grad_norm": 2.160369873046875, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7055605053901672, + "num_tokens": 139990184.0, + "step": 5518 + }, + { + "epoch": 0.6060839007247968, + "grad_norm": 
2.130067825317383, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.6907848119735718, + "num_tokens": 140018372.0, + "step": 5519 + }, + { + "epoch": 0.6061937184274105, + "grad_norm": 2.2537291049957275, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.6986490488052368, + "num_tokens": 140046914.0, + "step": 5520 + }, + { + "epoch": 0.6063035361300242, + "grad_norm": 2.386847734451294, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.6997209191322327, + "num_tokens": 140067639.0, + "step": 5521 + }, + { + "epoch": 0.6064133538326378, + "grad_norm": 2.2363104820251465, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6933257579803467, + "num_tokens": 140092110.0, + "step": 5522 + }, + { + "epoch": 0.6065231715352515, + "grad_norm": 2.508636951446533, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7063525915145874, + "num_tokens": 140112103.0, + "step": 5523 + }, + { + "epoch": 0.6066329892378651, + "grad_norm": 2.015023946762085, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.7059286236763, + "num_tokens": 140141441.0, + "step": 5524 + }, + { + "epoch": 0.6067428069404788, + "grad_norm": 1.9758996963500977, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7055438160896301, + "num_tokens": 140173147.0, + "step": 5525 + }, + { + "epoch": 0.6068526246430924, + "grad_norm": 2.181767463684082, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.6874688863754272, + "num_tokens": 140199373.0, + "step": 5526 + }, + { + "epoch": 0.6069624423457062, + "grad_norm": 2.1939539909362793, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7072391510009766, + "num_tokens": 140224867.0, + "step": 5527 + }, + { + "epoch": 0.6070722600483198, + "grad_norm": 2.483283281326294, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7023543119430542, + "num_tokens": 140246272.0, 
+ "step": 5528 + }, + { + "epoch": 0.6071820777509335, + "grad_norm": 2.4423582553863525, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7149752378463745, + "num_tokens": 140267226.0, + "step": 5529 + }, + { + "epoch": 0.6072918954535471, + "grad_norm": 1.9564857482910156, + "learning_rate": 1e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.6983104944229126, + "num_tokens": 140297957.0, + "step": 5530 + }, + { + "epoch": 0.6074017131561608, + "grad_norm": 2.3021843433380127, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7093314528465271, + "num_tokens": 140319384.0, + "step": 5531 + }, + { + "epoch": 0.6075115308587744, + "grad_norm": 2.188925266265869, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.7003326416015625, + "num_tokens": 140343868.0, + "step": 5532 + }, + { + "epoch": 0.607621348561388, + "grad_norm": 2.184126615524292, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.6984813213348389, + "num_tokens": 140371070.0, + "step": 5533 + }, + { + "epoch": 0.6077311662640018, + "grad_norm": 2.3078227043151855, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7072066068649292, + "num_tokens": 140393331.0, + "step": 5534 + }, + { + "epoch": 0.6078409839666155, + "grad_norm": 2.1612164974212646, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.7014683485031128, + "num_tokens": 140419604.0, + "step": 5535 + }, + { + "epoch": 0.6079508016692291, + "grad_norm": 2.176575183868408, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7085491418838501, + "num_tokens": 140447447.0, + "step": 5536 + }, + { + "epoch": 0.6080606193718427, + "grad_norm": 2.191089391708374, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.6991341710090637, + "num_tokens": 140471654.0, + "step": 5537 + }, + { + "epoch": 0.6081704370744564, + "grad_norm": 2.296280860900879, + "learning_rate": 1e-06, + "loss": 0.882, 
+ "mean_token_accuracy": 0.7248832583427429, + "num_tokens": 140493932.0, + "step": 5538 + }, + { + "epoch": 0.60828025477707, + "grad_norm": 2.136638879776001, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7197235822677612, + "num_tokens": 140523643.0, + "step": 5539 + }, + { + "epoch": 0.6083900724796837, + "grad_norm": 2.396955966949463, + "learning_rate": 1e-06, + "loss": 1.072, + "mean_token_accuracy": 0.6782331466674805, + "num_tokens": 140547255.0, + "step": 5540 + }, + { + "epoch": 0.6084998901822973, + "grad_norm": 1.8953477144241333, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.702111005783081, + "num_tokens": 140578171.0, + "step": 5541 + }, + { + "epoch": 0.6086097078849111, + "grad_norm": 2.032575845718384, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.6938149929046631, + "num_tokens": 140608439.0, + "step": 5542 + }, + { + "epoch": 0.6087195255875247, + "grad_norm": 2.0985124111175537, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7064275145530701, + "num_tokens": 140637422.0, + "step": 5543 + }, + { + "epoch": 0.6088293432901384, + "grad_norm": 2.35898756980896, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7151910066604614, + "num_tokens": 140660162.0, + "step": 5544 + }, + { + "epoch": 0.608939160992752, + "grad_norm": 1.9377284049987793, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.7028033137321472, + "num_tokens": 140690974.0, + "step": 5545 + }, + { + "epoch": 0.6090489786953657, + "grad_norm": 2.1466386318206787, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.6918255090713501, + "num_tokens": 140718093.0, + "step": 5546 + }, + { + "epoch": 0.6091587963979793, + "grad_norm": 2.109994411468506, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.7019022703170776, + "num_tokens": 140745822.0, + "step": 5547 + }, + { + "epoch": 0.609268614100593, + 
"grad_norm": 2.3707165718078613, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7359408140182495, + "num_tokens": 140767298.0, + "step": 5548 + }, + { + "epoch": 0.6093784318032067, + "grad_norm": 2.7057502269744873, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7345762252807617, + "num_tokens": 140783709.0, + "step": 5549 + }, + { + "epoch": 0.6094882495058204, + "grad_norm": 2.410994529724121, + "learning_rate": 1e-06, + "loss": 1.0463, + "mean_token_accuracy": 0.6895185708999634, + "num_tokens": 140807221.0, + "step": 5550 + }, + { + "epoch": 0.609598067208434, + "grad_norm": 1.9817078113555908, + "learning_rate": 1e-06, + "loss": 1.1016, + "mean_token_accuracy": 0.6693534851074219, + "num_tokens": 140839420.0, + "step": 5551 + }, + { + "epoch": 0.6097078849110477, + "grad_norm": 1.9012540578842163, + "learning_rate": 1e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.6966909170150757, + "num_tokens": 140873416.0, + "step": 5552 + }, + { + "epoch": 0.6098177026136613, + "grad_norm": 2.3747994899749756, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.7010992765426636, + "num_tokens": 140897110.0, + "step": 5553 + }, + { + "epoch": 0.609927520316275, + "grad_norm": 2.017932891845703, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.6947557926177979, + "num_tokens": 140926950.0, + "step": 5554 + }, + { + "epoch": 0.6100373380188886, + "grad_norm": 2.236203908920288, + "learning_rate": 1e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.6878166198730469, + "num_tokens": 140953302.0, + "step": 5555 + }, + { + "epoch": 0.6101471557215024, + "grad_norm": 2.3706464767456055, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7079750299453735, + "num_tokens": 140976071.0, + "step": 5556 + }, + { + "epoch": 0.610256973424116, + "grad_norm": 2.496725082397461, + "learning_rate": 1e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.738103449344635, + 
"num_tokens": 140994461.0, + "step": 5557 + }, + { + "epoch": 0.6103667911267296, + "grad_norm": 2.223701000213623, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.6977794170379639, + "num_tokens": 141021675.0, + "step": 5558 + }, + { + "epoch": 0.6104766088293433, + "grad_norm": 2.077693223953247, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.6920454502105713, + "num_tokens": 141051300.0, + "step": 5559 + }, + { + "epoch": 0.6105864265319569, + "grad_norm": 2.4883644580841064, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.710139274597168, + "num_tokens": 141074203.0, + "step": 5560 + }, + { + "epoch": 0.6106962442345706, + "grad_norm": 2.0840189456939697, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7093124389648438, + "num_tokens": 141101798.0, + "step": 5561 + }, + { + "epoch": 0.6108060619371842, + "grad_norm": 2.3829123973846436, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7035508155822754, + "num_tokens": 141125382.0, + "step": 5562 + }, + { + "epoch": 0.610915879639798, + "grad_norm": 2.2147440910339355, + "learning_rate": 1e-06, + "loss": 1.0463, + "mean_token_accuracy": 0.6808139085769653, + "num_tokens": 141151672.0, + "step": 5563 + }, + { + "epoch": 0.6110256973424116, + "grad_norm": 2.3807570934295654, + "learning_rate": 1e-06, + "loss": 1.0324, + "mean_token_accuracy": 0.6943844556808472, + "num_tokens": 141174995.0, + "step": 5564 + }, + { + "epoch": 0.6111355150450253, + "grad_norm": 2.188190221786499, + "learning_rate": 1e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.6904488801956177, + "num_tokens": 141200494.0, + "step": 5565 + }, + { + "epoch": 0.6112453327476389, + "grad_norm": 2.2030632495880127, + "learning_rate": 1e-06, + "loss": 1.0451, + "mean_token_accuracy": 0.6823806762695312, + "num_tokens": 141227030.0, + "step": 5566 + }, + { + "epoch": 0.6113551504502526, + "grad_norm": 2.0686850547790527, + 
"learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7204891443252563, + "num_tokens": 141253987.0, + "step": 5567 + }, + { + "epoch": 0.6114649681528662, + "grad_norm": 2.2823033332824707, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7166982293128967, + "num_tokens": 141277433.0, + "step": 5568 + }, + { + "epoch": 0.6115747858554799, + "grad_norm": 2.4336726665496826, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7185677289962769, + "num_tokens": 141298744.0, + "step": 5569 + }, + { + "epoch": 0.6116846035580935, + "grad_norm": 1.9448261260986328, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7048377990722656, + "num_tokens": 141331356.0, + "step": 5570 + }, + { + "epoch": 0.6117944212607073, + "grad_norm": 1.8314950466156006, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.6978573203086853, + "num_tokens": 141365891.0, + "step": 5571 + }, + { + "epoch": 0.6119042389633209, + "grad_norm": 2.1392345428466797, + "learning_rate": 1e-06, + "loss": 0.8228, + "mean_token_accuracy": 0.7426211833953857, + "num_tokens": 141391139.0, + "step": 5572 + }, + { + "epoch": 0.6120140566659346, + "grad_norm": 2.23242449760437, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7146839499473572, + "num_tokens": 141415666.0, + "step": 5573 + }, + { + "epoch": 0.6121238743685482, + "grad_norm": 2.1967623233795166, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7028751969337463, + "num_tokens": 141441967.0, + "step": 5574 + }, + { + "epoch": 0.6122336920711619, + "grad_norm": 2.364392042160034, + "learning_rate": 1e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.6912158727645874, + "num_tokens": 141465918.0, + "step": 5575 + }, + { + "epoch": 0.6123435097737755, + "grad_norm": 2.5648317337036133, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7132704257965088, + "num_tokens": 141485625.0, + "step": 5576 + 
}, + { + "epoch": 0.6124533274763891, + "grad_norm": 2.4844162464141846, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.7005224227905273, + "num_tokens": 141506105.0, + "step": 5577 + }, + { + "epoch": 0.6125631451790029, + "grad_norm": 2.059037685394287, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7255198359489441, + "num_tokens": 141532430.0, + "step": 5578 + }, + { + "epoch": 0.6126729628816165, + "grad_norm": 2.0590922832489014, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7151084542274475, + "num_tokens": 141561833.0, + "step": 5579 + }, + { + "epoch": 0.6127827805842302, + "grad_norm": 2.2435858249664307, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.6971156597137451, + "num_tokens": 141587435.0, + "step": 5580 + }, + { + "epoch": 0.6128925982868438, + "grad_norm": 1.8922646045684814, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.6867163777351379, + "num_tokens": 141619940.0, + "step": 5581 + }, + { + "epoch": 0.6130024159894575, + "grad_norm": 2.1564173698425293, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.6987310647964478, + "num_tokens": 141645688.0, + "step": 5582 + }, + { + "epoch": 0.6131122336920711, + "grad_norm": 2.227922201156616, + "learning_rate": 1e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.721210241317749, + "num_tokens": 141670065.0, + "step": 5583 + }, + { + "epoch": 0.6132220513946848, + "grad_norm": 2.3216657638549805, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.6902389526367188, + "num_tokens": 141692896.0, + "step": 5584 + }, + { + "epoch": 0.6133318690972985, + "grad_norm": 2.181135892868042, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.6909267902374268, + "num_tokens": 141720440.0, + "step": 5585 + }, + { + "epoch": 0.6134416867999122, + "grad_norm": 2.161916494369507, + "learning_rate": 1e-06, + "loss": 1.0443, + 
"mean_token_accuracy": 0.6856404542922974, + "num_tokens": 141746701.0, + "step": 5586 + }, + { + "epoch": 0.6135515045025258, + "grad_norm": 2.1074609756469727, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.71347576379776, + "num_tokens": 141773714.0, + "step": 5587 + }, + { + "epoch": 0.6136613222051395, + "grad_norm": 2.007864236831665, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.704915463924408, + "num_tokens": 141803026.0, + "step": 5588 + }, + { + "epoch": 0.6137711399077531, + "grad_norm": 2.166477680206299, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7070937156677246, + "num_tokens": 141828313.0, + "step": 5589 + }, + { + "epoch": 0.6138809576103668, + "grad_norm": 2.240187883377075, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7291431427001953, + "num_tokens": 141852437.0, + "step": 5590 + }, + { + "epoch": 0.6139907753129804, + "grad_norm": 2.336357355117798, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7131941318511963, + "num_tokens": 141873905.0, + "step": 5591 + }, + { + "epoch": 0.6141005930155942, + "grad_norm": 2.307293176651001, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7087069153785706, + "num_tokens": 141896777.0, + "step": 5592 + }, + { + "epoch": 0.6142104107182078, + "grad_norm": 2.229130983352661, + "learning_rate": 1e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.6963474750518799, + "num_tokens": 141919807.0, + "step": 5593 + }, + { + "epoch": 0.6143202284208215, + "grad_norm": 2.394425392150879, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7190848588943481, + "num_tokens": 141940969.0, + "step": 5594 + }, + { + "epoch": 0.6144300461234351, + "grad_norm": 2.3322858810424805, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.6967114210128784, + "num_tokens": 141965561.0, + "step": 5595 + }, + { + "epoch": 0.6145398638260487, + 
"grad_norm": 2.2587690353393555, + "learning_rate": 1e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.6874673962593079, + "num_tokens": 141991438.0, + "step": 5596 + }, + { + "epoch": 0.6146496815286624, + "grad_norm": 2.3042426109313965, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7196730375289917, + "num_tokens": 142013709.0, + "step": 5597 + }, + { + "epoch": 0.614759499231276, + "grad_norm": 2.2415313720703125, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7328529357910156, + "num_tokens": 142038825.0, + "step": 5598 + }, + { + "epoch": 0.6148693169338898, + "grad_norm": 2.586444854736328, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7106946706771851, + "num_tokens": 142056868.0, + "step": 5599 + }, + { + "epoch": 0.6149791346365034, + "grad_norm": 2.0426695346832275, + "learning_rate": 1e-06, + "loss": 1.0631, + "mean_token_accuracy": 0.6898918747901917, + "num_tokens": 142086421.0, + "step": 5600 + }, + { + "epoch": 0.6150889523391171, + "grad_norm": 2.1448447704315186, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7322443127632141, + "num_tokens": 142110826.0, + "step": 5601 + }, + { + "epoch": 0.6151987700417307, + "grad_norm": 2.3377163410186768, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7154401540756226, + "num_tokens": 142134045.0, + "step": 5602 + }, + { + "epoch": 0.6153085877443444, + "grad_norm": 2.586270332336426, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7178236246109009, + "num_tokens": 142152448.0, + "step": 5603 + }, + { + "epoch": 0.615418405446958, + "grad_norm": 2.4729413986206055, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.6937789916992188, + "num_tokens": 142174600.0, + "step": 5604 + }, + { + "epoch": 0.6155282231495717, + "grad_norm": 2.0497376918792725, + "learning_rate": 1e-06, + "loss": 1.0391, + "mean_token_accuracy": 0.6825517416000366, + 
"num_tokens": 142202483.0, + "step": 5605 + }, + { + "epoch": 0.6156380408521853, + "grad_norm": 2.001594066619873, + "learning_rate": 1e-06, + "loss": 1.0579, + "mean_token_accuracy": 0.6874651908874512, + "num_tokens": 142232329.0, + "step": 5606 + }, + { + "epoch": 0.6157478585547991, + "grad_norm": 2.348541498184204, + "learning_rate": 1e-06, + "loss": 1.0108, + "mean_token_accuracy": 0.6964108347892761, + "num_tokens": 142254636.0, + "step": 5607 + }, + { + "epoch": 0.6158576762574127, + "grad_norm": 2.3853960037231445, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7105420827865601, + "num_tokens": 142276420.0, + "step": 5608 + }, + { + "epoch": 0.6159674939600264, + "grad_norm": 2.6258552074432373, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7084027528762817, + "num_tokens": 142294013.0, + "step": 5609 + }, + { + "epoch": 0.61607731166264, + "grad_norm": 2.146090030670166, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7195640802383423, + "num_tokens": 142319812.0, + "step": 5610 + }, + { + "epoch": 0.6161871293652537, + "grad_norm": 2.302581310272217, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7039698958396912, + "num_tokens": 142344979.0, + "step": 5611 + }, + { + "epoch": 0.6162969470678673, + "grad_norm": 2.293269395828247, + "learning_rate": 1e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.6945513486862183, + "num_tokens": 142368222.0, + "step": 5612 + }, + { + "epoch": 0.616406764770481, + "grad_norm": 2.3243768215179443, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7301777005195618, + "num_tokens": 142390561.0, + "step": 5613 + }, + { + "epoch": 0.6165165824730947, + "grad_norm": 2.306784152984619, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7220681309700012, + "num_tokens": 142415054.0, + "step": 5614 + }, + { + "epoch": 0.6166264001757084, + "grad_norm": 2.2937753200531006, + "learning_rate": 
1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7036966681480408, + "num_tokens": 142440284.0, + "step": 5615 + }, + { + "epoch": 0.616736217878322, + "grad_norm": 2.1220781803131104, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.6917003989219666, + "num_tokens": 142466795.0, + "step": 5616 + }, + { + "epoch": 0.6168460355809356, + "grad_norm": 2.213229179382324, + "learning_rate": 1e-06, + "loss": 1.0374, + "mean_token_accuracy": 0.6868787407875061, + "num_tokens": 142491739.0, + "step": 5617 + }, + { + "epoch": 0.6169558532835493, + "grad_norm": 2.1849405765533447, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7116449475288391, + "num_tokens": 142515769.0, + "step": 5618 + }, + { + "epoch": 0.6170656709861629, + "grad_norm": 1.9625089168548584, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.694719672203064, + "num_tokens": 142548840.0, + "step": 5619 + }, + { + "epoch": 0.6171754886887766, + "grad_norm": 2.495041608810425, + "learning_rate": 1e-06, + "loss": 1.041, + "mean_token_accuracy": 0.6866639256477356, + "num_tokens": 142573090.0, + "step": 5620 + }, + { + "epoch": 0.6172853063913903, + "grad_norm": 1.9585524797439575, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7080736756324768, + "num_tokens": 142602723.0, + "step": 5621 + }, + { + "epoch": 0.617395124094004, + "grad_norm": 2.064242124557495, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7020202279090881, + "num_tokens": 142630801.0, + "step": 5622 + }, + { + "epoch": 0.6175049417966176, + "grad_norm": 2.0667288303375244, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7082200050354004, + "num_tokens": 142657950.0, + "step": 5623 + }, + { + "epoch": 0.6176147594992313, + "grad_norm": 2.2015905380249023, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7120568752288818, + "num_tokens": 142682964.0, + "step": 5624 + }, + { + "epoch": 
0.6177245772018449, + "grad_norm": 2.047636032104492, + "learning_rate": 1e-06, + "loss": 1.0736, + "mean_token_accuracy": 0.6769825220108032, + "num_tokens": 142712766.0, + "step": 5625 + }, + { + "epoch": 0.6178343949044586, + "grad_norm": 2.375427007675171, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.706134557723999, + "num_tokens": 142732666.0, + "step": 5626 + }, + { + "epoch": 0.6179442126070722, + "grad_norm": 2.17871356010437, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.6972671151161194, + "num_tokens": 142759279.0, + "step": 5627 + }, + { + "epoch": 0.618054030309686, + "grad_norm": 2.307370662689209, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7258968353271484, + "num_tokens": 142783718.0, + "step": 5628 + }, + { + "epoch": 0.6181638480122996, + "grad_norm": 2.311486005783081, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7195418477058411, + "num_tokens": 142808987.0, + "step": 5629 + }, + { + "epoch": 0.6182736657149133, + "grad_norm": 2.139176368713379, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7050303220748901, + "num_tokens": 142835598.0, + "step": 5630 + }, + { + "epoch": 0.6183834834175269, + "grad_norm": 2.2374846935272217, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.708615243434906, + "num_tokens": 142859716.0, + "step": 5631 + }, + { + "epoch": 0.6184933011201406, + "grad_norm": 1.9846380949020386, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.6902264356613159, + "num_tokens": 142890843.0, + "step": 5632 + }, + { + "epoch": 0.6186031188227542, + "grad_norm": 1.8970935344696045, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7094461917877197, + "num_tokens": 142924696.0, + "step": 5633 + }, + { + "epoch": 0.6187129365253679, + "grad_norm": 2.2414462566375732, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 
0.7022773623466492, + "num_tokens": 142948597.0, + "step": 5634 + }, + { + "epoch": 0.6188227542279815, + "grad_norm": 2.130326271057129, + "learning_rate": 1e-06, + "loss": 1.0569, + "mean_token_accuracy": 0.6895393133163452, + "num_tokens": 142975095.0, + "step": 5635 + }, + { + "epoch": 0.6189325719305953, + "grad_norm": 2.367304563522339, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.712098240852356, + "num_tokens": 142997422.0, + "step": 5636 + }, + { + "epoch": 0.6190423896332089, + "grad_norm": 2.153040885925293, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7237225770950317, + "num_tokens": 143023542.0, + "step": 5637 + }, + { + "epoch": 0.6191522073358225, + "grad_norm": 2.192249298095703, + "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.7006871700286865, + "num_tokens": 143049299.0, + "step": 5638 + }, + { + "epoch": 0.6192620250384362, + "grad_norm": 2.034569025039673, + "learning_rate": 1e-06, + "loss": 1.0393, + "mean_token_accuracy": 0.6834951043128967, + "num_tokens": 143078652.0, + "step": 5639 + }, + { + "epoch": 0.6193718427410498, + "grad_norm": 2.2700796127319336, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7052760124206543, + "num_tokens": 143103984.0, + "step": 5640 + }, + { + "epoch": 0.6194816604436635, + "grad_norm": 2.097043991088867, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.7063310742378235, + "num_tokens": 143130368.0, + "step": 5641 + }, + { + "epoch": 0.6195914781462771, + "grad_norm": 2.4941728115081787, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.6975157856941223, + "num_tokens": 143153988.0, + "step": 5642 + }, + { + "epoch": 0.6197012958488909, + "grad_norm": 2.782867908477783, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.714568555355072, + "num_tokens": 143171876.0, + "step": 5643 + }, + { + "epoch": 0.6198111135515045, + "grad_norm": 
2.164022445678711, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.6960561275482178, + "num_tokens": 143199024.0, + "step": 5644 + }, + { + "epoch": 0.6199209312541182, + "grad_norm": 2.266511917114258, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.6862308979034424, + "num_tokens": 143223342.0, + "step": 5645 + }, + { + "epoch": 0.6200307489567318, + "grad_norm": 2.3379671573638916, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7027616500854492, + "num_tokens": 143247154.0, + "step": 5646 + }, + { + "epoch": 0.6201405666593455, + "grad_norm": 2.6514978408813477, + "learning_rate": 1e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7318208813667297, + "num_tokens": 143265542.0, + "step": 5647 + }, + { + "epoch": 0.6202503843619591, + "grad_norm": 2.120166778564453, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.6998648047447205, + "num_tokens": 143292175.0, + "step": 5648 + }, + { + "epoch": 0.6203602020645728, + "grad_norm": 2.365872383117676, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7130941152572632, + "num_tokens": 143314174.0, + "step": 5649 + }, + { + "epoch": 0.6204700197671865, + "grad_norm": 2.4766359329223633, + "learning_rate": 1e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.7379355430603027, + "num_tokens": 143334074.0, + "step": 5650 + }, + { + "epoch": 0.6205798374698002, + "grad_norm": 2.2427144050598145, + "learning_rate": 1e-06, + "loss": 0.8766, + "mean_token_accuracy": 0.7247231602668762, + "num_tokens": 143356560.0, + "step": 5651 + }, + { + "epoch": 0.6206896551724138, + "grad_norm": 2.206346035003662, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7209767699241638, + "num_tokens": 143380286.0, + "step": 5652 + }, + { + "epoch": 0.6207994728750275, + "grad_norm": 2.068786382675171, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.7008483409881592, + "num_tokens": 
143409339.0, + "step": 5653 + }, + { + "epoch": 0.6209092905776411, + "grad_norm": 2.197713851928711, + "learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.6874078512191772, + "num_tokens": 143435466.0, + "step": 5654 + }, + { + "epoch": 0.6210191082802548, + "grad_norm": 2.046323537826538, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7065280079841614, + "num_tokens": 143463480.0, + "step": 5655 + }, + { + "epoch": 0.6211289259828684, + "grad_norm": 2.2257556915283203, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.708396852016449, + "num_tokens": 143487349.0, + "step": 5656 + }, + { + "epoch": 0.6212387436854822, + "grad_norm": 2.3628082275390625, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.6954972743988037, + "num_tokens": 143510470.0, + "step": 5657 + }, + { + "epoch": 0.6213485613880958, + "grad_norm": 2.4660372734069824, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.712291419506073, + "num_tokens": 143531096.0, + "step": 5658 + }, + { + "epoch": 0.6214583790907094, + "grad_norm": 2.241117238998413, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7148645520210266, + "num_tokens": 143558593.0, + "step": 5659 + }, + { + "epoch": 0.6215681967933231, + "grad_norm": 2.506385087966919, + "learning_rate": 1e-06, + "loss": 0.8032, + "mean_token_accuracy": 0.7456836104393005, + "num_tokens": 143577159.0, + "step": 5660 + }, + { + "epoch": 0.6216780144959367, + "grad_norm": 2.333885669708252, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.732727587223053, + "num_tokens": 143598970.0, + "step": 5661 + }, + { + "epoch": 0.6217878321985504, + "grad_norm": 2.635392665863037, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7225569486618042, + "num_tokens": 143617016.0, + "step": 5662 + }, + { + "epoch": 0.621897649901164, + "grad_norm": 2.436575412750244, + "learning_rate": 1e-06, + "loss": 
0.9368, + "mean_token_accuracy": 0.7076877355575562, + "num_tokens": 143637546.0, + "step": 5663 + }, + { + "epoch": 0.6220074676037777, + "grad_norm": 1.9002879858016968, + "learning_rate": 1e-06, + "loss": 1.0344, + "mean_token_accuracy": 0.6887516975402832, + "num_tokens": 143670885.0, + "step": 5664 + }, + { + "epoch": 0.6221172853063914, + "grad_norm": 2.4675133228302, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.6961330771446228, + "num_tokens": 143693504.0, + "step": 5665 + }, + { + "epoch": 0.6222271030090051, + "grad_norm": 2.272491455078125, + "learning_rate": 1e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.7067294120788574, + "num_tokens": 143717941.0, + "step": 5666 + }, + { + "epoch": 0.6223369207116187, + "grad_norm": 2.0561375617980957, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7223571538925171, + "num_tokens": 143744786.0, + "step": 5667 + }, + { + "epoch": 0.6224467384142324, + "grad_norm": 2.408607244491577, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7113538384437561, + "num_tokens": 143766075.0, + "step": 5668 + }, + { + "epoch": 0.622556556116846, + "grad_norm": 2.0710160732269287, + "learning_rate": 1e-06, + "loss": 1.0739, + "mean_token_accuracy": 0.6864213347434998, + "num_tokens": 143795500.0, + "step": 5669 + }, + { + "epoch": 0.6226663738194597, + "grad_norm": 2.1477091312408447, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7229738235473633, + "num_tokens": 143820918.0, + "step": 5670 + }, + { + "epoch": 0.6227761915220733, + "grad_norm": 2.2013967037200928, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7267066240310669, + "num_tokens": 143844622.0, + "step": 5671 + }, + { + "epoch": 0.6228860092246871, + "grad_norm": 2.2186100482940674, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7080352306365967, + "num_tokens": 143867441.0, + "step": 5672 + }, + { + "epoch": 
0.6229958269273007, + "grad_norm": 2.3637325763702393, + "learning_rate": 1e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.6915003061294556, + "num_tokens": 143891137.0, + "step": 5673 + }, + { + "epoch": 0.6231056446299144, + "grad_norm": 2.6568610668182373, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7308574914932251, + "num_tokens": 143909831.0, + "step": 5674 + }, + { + "epoch": 0.623215462332528, + "grad_norm": 2.1562631130218506, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.7008930444717407, + "num_tokens": 143934495.0, + "step": 5675 + }, + { + "epoch": 0.6233252800351416, + "grad_norm": 2.314157247543335, + "learning_rate": 1e-06, + "loss": 1.0344, + "mean_token_accuracy": 0.6826165914535522, + "num_tokens": 143960290.0, + "step": 5676 + }, + { + "epoch": 0.6234350977377553, + "grad_norm": 2.0755529403686523, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.710594117641449, + "num_tokens": 143989779.0, + "step": 5677 + }, + { + "epoch": 0.6235449154403689, + "grad_norm": 2.160996675491333, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7202374935150146, + "num_tokens": 144014975.0, + "step": 5678 + }, + { + "epoch": 0.6236547331429827, + "grad_norm": 2.3450677394866943, + "learning_rate": 1e-06, + "loss": 1.0502, + "mean_token_accuracy": 0.6973021626472473, + "num_tokens": 144039790.0, + "step": 5679 + }, + { + "epoch": 0.6237645508455963, + "grad_norm": 2.3612115383148193, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7049216032028198, + "num_tokens": 144063391.0, + "step": 5680 + }, + { + "epoch": 0.62387436854821, + "grad_norm": 2.1192734241485596, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.712544858455658, + "num_tokens": 144091456.0, + "step": 5681 + }, + { + "epoch": 0.6239841862508236, + "grad_norm": 2.5814218521118164, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 
0.720731258392334, + "num_tokens": 144112166.0, + "step": 5682 + }, + { + "epoch": 0.6240940039534373, + "grad_norm": 2.0163135528564453, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7076519727706909, + "num_tokens": 144141023.0, + "step": 5683 + }, + { + "epoch": 0.6242038216560509, + "grad_norm": 1.9406514167785645, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7059849500656128, + "num_tokens": 144172532.0, + "step": 5684 + }, + { + "epoch": 0.6243136393586646, + "grad_norm": 2.0906989574432373, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7182460427284241, + "num_tokens": 144198732.0, + "step": 5685 + }, + { + "epoch": 0.6244234570612783, + "grad_norm": 2.2500011920928955, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7198750972747803, + "num_tokens": 144221279.0, + "step": 5686 + }, + { + "epoch": 0.624533274763892, + "grad_norm": 2.369819402694702, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.706631600856781, + "num_tokens": 144243963.0, + "step": 5687 + }, + { + "epoch": 0.6246430924665056, + "grad_norm": 2.388658285140991, + "learning_rate": 1e-06, + "loss": 1.0316, + "mean_token_accuracy": 0.6835215091705322, + "num_tokens": 144266996.0, + "step": 5688 + }, + { + "epoch": 0.6247529101691193, + "grad_norm": 2.358076572418213, + "learning_rate": 1e-06, + "loss": 1.0579, + "mean_token_accuracy": 0.6797771453857422, + "num_tokens": 144292545.0, + "step": 5689 + }, + { + "epoch": 0.6248627278717329, + "grad_norm": 2.14036226272583, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7039928436279297, + "num_tokens": 144317088.0, + "step": 5690 + }, + { + "epoch": 0.6249725455743466, + "grad_norm": 2.3027076721191406, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.6937240362167358, + "num_tokens": 144342113.0, + "step": 5691 + }, + { + "epoch": 0.6250823632769602, + "grad_norm": 
2.2062275409698486, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7117210626602173, + "num_tokens": 144367855.0, + "step": 5692 + }, + { + "epoch": 0.6251921809795739, + "grad_norm": 2.0445752143859863, + "learning_rate": 1e-06, + "loss": 1.0566, + "mean_token_accuracy": 0.6972670555114746, + "num_tokens": 144398356.0, + "step": 5693 + }, + { + "epoch": 0.6253019986821876, + "grad_norm": 2.039449453353882, + "learning_rate": 1e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.6932616829872131, + "num_tokens": 144428548.0, + "step": 5694 + }, + { + "epoch": 0.6254118163848013, + "grad_norm": 2.271613359451294, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7284649014472961, + "num_tokens": 144453143.0, + "step": 5695 + }, + { + "epoch": 0.6255216340874149, + "grad_norm": 2.264915943145752, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7064454555511475, + "num_tokens": 144475605.0, + "step": 5696 + }, + { + "epoch": 0.6256314517900285, + "grad_norm": 2.185561180114746, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7070372700691223, + "num_tokens": 144501332.0, + "step": 5697 + }, + { + "epoch": 0.6257412694926422, + "grad_norm": 2.2889840602874756, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.6983702182769775, + "num_tokens": 144525096.0, + "step": 5698 + }, + { + "epoch": 0.6258510871952558, + "grad_norm": 2.544872760772705, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7150830626487732, + "num_tokens": 144545328.0, + "step": 5699 + }, + { + "epoch": 0.6259609048978695, + "grad_norm": 2.083372116088867, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7035399675369263, + "num_tokens": 144572911.0, + "step": 5700 + }, + { + "epoch": 0.6260707226004832, + "grad_norm": 2.15460467338562, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.6953126788139343, + "num_tokens": 
144599843.0, + "step": 5701 + }, + { + "epoch": 0.6261805403030969, + "grad_norm": 2.2228145599365234, + "learning_rate": 1e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.6920937299728394, + "num_tokens": 144627186.0, + "step": 5702 + }, + { + "epoch": 0.6262903580057105, + "grad_norm": 1.8871268033981323, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.6874299049377441, + "num_tokens": 144661028.0, + "step": 5703 + }, + { + "epoch": 0.6264001757083242, + "grad_norm": 2.1551356315612793, + "learning_rate": 1e-06, + "loss": 1.0355, + "mean_token_accuracy": 0.6913174390792847, + "num_tokens": 144692259.0, + "step": 5704 + }, + { + "epoch": 0.6265099934109378, + "grad_norm": 2.313863515853882, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7064279317855835, + "num_tokens": 144717649.0, + "step": 5705 + }, + { + "epoch": 0.6266198111135515, + "grad_norm": 2.3161826133728027, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.713396430015564, + "num_tokens": 144740397.0, + "step": 5706 + }, + { + "epoch": 0.6267296288161651, + "grad_norm": 2.1417150497436523, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7352290153503418, + "num_tokens": 144765175.0, + "step": 5707 + }, + { + "epoch": 0.6268394465187789, + "grad_norm": 2.234672784805298, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.6977697610855103, + "num_tokens": 144790309.0, + "step": 5708 + }, + { + "epoch": 0.6269492642213925, + "grad_norm": 2.2581751346588135, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7008587121963501, + "num_tokens": 144813497.0, + "step": 5709 + }, + { + "epoch": 0.6270590819240062, + "grad_norm": 2.1972405910491943, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7071289420127869, + "num_tokens": 144838925.0, + "step": 5710 + }, + { + "epoch": 0.6271688996266198, + "grad_norm": 2.2097859382629395, + "learning_rate": 1e-06, + 
"loss": 0.9685, + "mean_token_accuracy": 0.6985942125320435, + "num_tokens": 144861980.0, + "step": 5711 + }, + { + "epoch": 0.6272787173292335, + "grad_norm": 2.1308178901672363, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.7073513269424438, + "num_tokens": 144888043.0, + "step": 5712 + }, + { + "epoch": 0.6273885350318471, + "grad_norm": 2.267426013946533, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7080843448638916, + "num_tokens": 144911216.0, + "step": 5713 + }, + { + "epoch": 0.6274983527344608, + "grad_norm": 2.0974011421203613, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7291049957275391, + "num_tokens": 144938525.0, + "step": 5714 + }, + { + "epoch": 0.6276081704370745, + "grad_norm": 2.3925890922546387, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.6970542073249817, + "num_tokens": 144960423.0, + "step": 5715 + }, + { + "epoch": 0.6277179881396882, + "grad_norm": 2.125481367111206, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7221347689628601, + "num_tokens": 144985100.0, + "step": 5716 + }, + { + "epoch": 0.6278278058423018, + "grad_norm": 2.1061489582061768, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.6998967528343201, + "num_tokens": 145012933.0, + "step": 5717 + }, + { + "epoch": 0.6279376235449154, + "grad_norm": 2.176675796508789, + "learning_rate": 1e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7094310522079468, + "num_tokens": 145039004.0, + "step": 5718 + }, + { + "epoch": 0.6280474412475291, + "grad_norm": 2.545145034790039, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.7001221179962158, + "num_tokens": 145059715.0, + "step": 5719 + }, + { + "epoch": 0.6281572589501427, + "grad_norm": 2.1794846057891846, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.6946529746055603, + "num_tokens": 145086150.0, + "step": 5720 + }, + { + "epoch": 
0.6282670766527564, + "grad_norm": 2.4259963035583496, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.6933284997940063, + "num_tokens": 145109124.0, + "step": 5721 + }, + { + "epoch": 0.62837689435537, + "grad_norm": 2.232224941253662, + "learning_rate": 1e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7303076386451721, + "num_tokens": 145130051.0, + "step": 5722 + }, + { + "epoch": 0.6284867120579838, + "grad_norm": 2.4858155250549316, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.704283595085144, + "num_tokens": 145150713.0, + "step": 5723 + }, + { + "epoch": 0.6285965297605974, + "grad_norm": 2.0027050971984863, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.6924492120742798, + "num_tokens": 145181236.0, + "step": 5724 + }, + { + "epoch": 0.6287063474632111, + "grad_norm": 2.4577407836914062, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7162312269210815, + "num_tokens": 145204507.0, + "step": 5725 + }, + { + "epoch": 0.6288161651658247, + "grad_norm": 2.391059398651123, + "learning_rate": 1e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.6957119107246399, + "num_tokens": 145229477.0, + "step": 5726 + }, + { + "epoch": 0.6289259828684384, + "grad_norm": 2.3012478351593018, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7134591937065125, + "num_tokens": 145251513.0, + "step": 5727 + }, + { + "epoch": 0.629035800571052, + "grad_norm": 1.9659619331359863, + "learning_rate": 1e-06, + "loss": 1.056, + "mean_token_accuracy": 0.6757446527481079, + "num_tokens": 145283493.0, + "step": 5728 + }, + { + "epoch": 0.6291456182736657, + "grad_norm": 2.1902334690093994, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.694919764995575, + "num_tokens": 145308840.0, + "step": 5729 + }, + { + "epoch": 0.6292554359762794, + "grad_norm": 2.390976905822754, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 
0.7247655987739563, + "num_tokens": 145329217.0, + "step": 5730 + }, + { + "epoch": 0.6293652536788931, + "grad_norm": 2.334390163421631, + "learning_rate": 1e-06, + "loss": 1.054, + "mean_token_accuracy": 0.6833178997039795, + "num_tokens": 145354505.0, + "step": 5731 + }, + { + "epoch": 0.6294750713815067, + "grad_norm": 2.2490758895874023, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.71736741065979, + "num_tokens": 145378316.0, + "step": 5732 + }, + { + "epoch": 0.6295848890841204, + "grad_norm": 2.601440668106079, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7195615768432617, + "num_tokens": 145398261.0, + "step": 5733 + }, + { + "epoch": 0.629694706786734, + "grad_norm": 2.180642604827881, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.7067051529884338, + "num_tokens": 145425074.0, + "step": 5734 + }, + { + "epoch": 0.6298045244893477, + "grad_norm": 2.2137906551361084, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.708469033241272, + "num_tokens": 145449631.0, + "step": 5735 + }, + { + "epoch": 0.6299143421919613, + "grad_norm": 1.9599705934524536, + "learning_rate": 1e-06, + "loss": 1.025, + "mean_token_accuracy": 0.6946699619293213, + "num_tokens": 145481123.0, + "step": 5736 + }, + { + "epoch": 0.630024159894575, + "grad_norm": 2.3662519454956055, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7054795026779175, + "num_tokens": 145502455.0, + "step": 5737 + }, + { + "epoch": 0.6301339775971887, + "grad_norm": 2.3859732151031494, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7166121006011963, + "num_tokens": 145524051.0, + "step": 5738 + }, + { + "epoch": 0.6302437952998023, + "grad_norm": 2.0434932708740234, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7126528024673462, + "num_tokens": 145550998.0, + "step": 5739 + }, + { + "epoch": 0.630353613002416, + "grad_norm": 2.372018337249756, + 
"learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7358143925666809, + "num_tokens": 145571333.0, + "step": 5740 + }, + { + "epoch": 0.6304634307050296, + "grad_norm": 2.148728609085083, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.713761031627655, + "num_tokens": 145597031.0, + "step": 5741 + }, + { + "epoch": 0.6305732484076433, + "grad_norm": 2.257356643676758, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.6961176991462708, + "num_tokens": 145620437.0, + "step": 5742 + }, + { + "epoch": 0.6306830661102569, + "grad_norm": 2.7210469245910645, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7116604447364807, + "num_tokens": 145637259.0, + "step": 5743 + }, + { + "epoch": 0.6307928838128707, + "grad_norm": 1.881615400314331, + "learning_rate": 1e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.6845535635948181, + "num_tokens": 145669161.0, + "step": 5744 + }, + { + "epoch": 0.6309027015154843, + "grad_norm": 2.1984102725982666, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7020620703697205, + "num_tokens": 145695505.0, + "step": 5745 + }, + { + "epoch": 0.631012519218098, + "grad_norm": 2.120746374130249, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.7021210789680481, + "num_tokens": 145720794.0, + "step": 5746 + }, + { + "epoch": 0.6311223369207116, + "grad_norm": 2.047884941101074, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7057352662086487, + "num_tokens": 145747811.0, + "step": 5747 + }, + { + "epoch": 0.6312321546233253, + "grad_norm": 2.387842893600464, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.6978861689567566, + "num_tokens": 145770181.0, + "step": 5748 + }, + { + "epoch": 0.6313419723259389, + "grad_norm": 2.0929970741271973, + "learning_rate": 1e-06, + "loss": 1.0592, + "mean_token_accuracy": 0.6827319860458374, + "num_tokens": 145799478.0, + "step": 5749 + }, 
+ { + "epoch": 0.6314517900285526, + "grad_norm": 2.101919651031494, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.714260995388031, + "num_tokens": 145825806.0, + "step": 5750 + }, + { + "epoch": 0.6315616077311663, + "grad_norm": 2.1845524311065674, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7161276936531067, + "num_tokens": 145849921.0, + "step": 5751 + }, + { + "epoch": 0.63167142543378, + "grad_norm": 2.0186688899993896, + "learning_rate": 1e-06, + "loss": 0.8999, + "mean_token_accuracy": 0.7166451811790466, + "num_tokens": 145879880.0, + "step": 5752 + }, + { + "epoch": 0.6317812431363936, + "grad_norm": 2.569960355758667, + "learning_rate": 1e-06, + "loss": 0.7289, + "mean_token_accuracy": 0.7687838077545166, + "num_tokens": 145897621.0, + "step": 5753 + }, + { + "epoch": 0.6318910608390073, + "grad_norm": 2.4238409996032715, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7123445272445679, + "num_tokens": 145920468.0, + "step": 5754 + }, + { + "epoch": 0.6320008785416209, + "grad_norm": 2.356816530227661, + "learning_rate": 1e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.688033938407898, + "num_tokens": 145943074.0, + "step": 5755 + }, + { + "epoch": 0.6321106962442345, + "grad_norm": 2.638176679611206, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.7046043276786804, + "num_tokens": 145962915.0, + "step": 5756 + }, + { + "epoch": 0.6322205139468482, + "grad_norm": 2.170914888381958, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7017903327941895, + "num_tokens": 145987630.0, + "step": 5757 + }, + { + "epoch": 0.6323303316494618, + "grad_norm": 2.3565990924835205, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.7083969116210938, + "num_tokens": 146011813.0, + "step": 5758 + }, + { + "epoch": 0.6324401493520756, + "grad_norm": 2.167902708053589, + "learning_rate": 1e-06, + "loss": 0.9299, + 
"mean_token_accuracy": 0.7125794887542725, + "num_tokens": 146034669.0, + "step": 5759 + }, + { + "epoch": 0.6325499670546892, + "grad_norm": 2.1499361991882324, + "learning_rate": 1e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.7007352709770203, + "num_tokens": 146060916.0, + "step": 5760 + }, + { + "epoch": 0.6326597847573029, + "grad_norm": 2.077674627304077, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7073508501052856, + "num_tokens": 146087707.0, + "step": 5761 + }, + { + "epoch": 0.6327696024599165, + "grad_norm": 2.54400634765625, + "learning_rate": 1e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7294454574584961, + "num_tokens": 146106157.0, + "step": 5762 + }, + { + "epoch": 0.6328794201625302, + "grad_norm": 2.2526485919952393, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7051275968551636, + "num_tokens": 146129041.0, + "step": 5763 + }, + { + "epoch": 0.6329892378651438, + "grad_norm": 2.447014570236206, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7324998378753662, + "num_tokens": 146149272.0, + "step": 5764 + }, + { + "epoch": 0.6330990555677575, + "grad_norm": 2.156862735748291, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7193721532821655, + "num_tokens": 146174838.0, + "step": 5765 + }, + { + "epoch": 0.6332088732703712, + "grad_norm": 2.0103225708007812, + "learning_rate": 1e-06, + "loss": 1.0665, + "mean_token_accuracy": 0.6803052425384521, + "num_tokens": 146203303.0, + "step": 5766 + }, + { + "epoch": 0.6333186909729849, + "grad_norm": 2.2666015625, + "learning_rate": 1e-06, + "loss": 1.0674, + "mean_token_accuracy": 0.6769874691963196, + "num_tokens": 146228640.0, + "step": 5767 + }, + { + "epoch": 0.6334285086755985, + "grad_norm": 1.9989571571350098, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7070651054382324, + "num_tokens": 146256840.0, + "step": 5768 + }, + { + "epoch": 0.6335383263782122, + 
"grad_norm": 1.9682917594909668, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.7048449516296387, + "num_tokens": 146286509.0, + "step": 5769 + }, + { + "epoch": 0.6336481440808258, + "grad_norm": 2.1923282146453857, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.6938033103942871, + "num_tokens": 146311306.0, + "step": 5770 + }, + { + "epoch": 0.6337579617834395, + "grad_norm": 2.3492586612701416, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7060967683792114, + "num_tokens": 146334647.0, + "step": 5771 + }, + { + "epoch": 0.6338677794860531, + "grad_norm": 2.2262985706329346, + "learning_rate": 1e-06, + "loss": 1.0549, + "mean_token_accuracy": 0.6833597421646118, + "num_tokens": 146359489.0, + "step": 5772 + }, + { + "epoch": 0.6339775971886669, + "grad_norm": 2.4503333568573, + "learning_rate": 1e-06, + "loss": 1.0518, + "mean_token_accuracy": 0.6975399851799011, + "num_tokens": 146380504.0, + "step": 5773 + }, + { + "epoch": 0.6340874148912805, + "grad_norm": 2.210322856903076, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7172839641571045, + "num_tokens": 146403722.0, + "step": 5774 + }, + { + "epoch": 0.6341972325938942, + "grad_norm": 2.27091383934021, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7002695798873901, + "num_tokens": 146427178.0, + "step": 5775 + }, + { + "epoch": 0.6343070502965078, + "grad_norm": 2.120974540710449, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7046401500701904, + "num_tokens": 146454966.0, + "step": 5776 + }, + { + "epoch": 0.6344168679991214, + "grad_norm": 2.2733826637268066, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.706876277923584, + "num_tokens": 146477488.0, + "step": 5777 + }, + { + "epoch": 0.6345266857017351, + "grad_norm": 2.5174930095672607, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7274433970451355, + 
"num_tokens": 146496967.0, + "step": 5778 + }, + { + "epoch": 0.6346365034043487, + "grad_norm": 2.3637795448303223, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7115920186042786, + "num_tokens": 146519050.0, + "step": 5779 + }, + { + "epoch": 0.6347463211069625, + "grad_norm": 2.2535064220428467, + "learning_rate": 1e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.7005894184112549, + "num_tokens": 146543441.0, + "step": 5780 + }, + { + "epoch": 0.6348561388095761, + "grad_norm": 2.4210872650146484, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7176419496536255, + "num_tokens": 146564663.0, + "step": 5781 + }, + { + "epoch": 0.6349659565121898, + "grad_norm": 2.2554333209991455, + "learning_rate": 1e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.6911448240280151, + "num_tokens": 146589658.0, + "step": 5782 + }, + { + "epoch": 0.6350757742148034, + "grad_norm": 2.387274980545044, + "learning_rate": 1e-06, + "loss": 1.0559, + "mean_token_accuracy": 0.6856535077095032, + "num_tokens": 146612198.0, + "step": 5783 + }, + { + "epoch": 0.6351855919174171, + "grad_norm": 2.045900344848633, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.6942719221115112, + "num_tokens": 146640762.0, + "step": 5784 + }, + { + "epoch": 0.6352954096200307, + "grad_norm": 2.2436466217041016, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7189339399337769, + "num_tokens": 146663212.0, + "step": 5785 + }, + { + "epoch": 0.6354052273226444, + "grad_norm": 2.1103298664093018, + "learning_rate": 1e-06, + "loss": 1.027, + "mean_token_accuracy": 0.6925429105758667, + "num_tokens": 146692477.0, + "step": 5786 + }, + { + "epoch": 0.635515045025258, + "grad_norm": 1.8822380304336548, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.6965029239654541, + "num_tokens": 146728523.0, + "step": 5787 + }, + { + "epoch": 0.6356248627278718, + "grad_norm": 2.096125602722168, + 
"learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7039331197738647, + "num_tokens": 146756128.0, + "step": 5788 + }, + { + "epoch": 0.6357346804304854, + "grad_norm": 2.273033618927002, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7068122625350952, + "num_tokens": 146780701.0, + "step": 5789 + }, + { + "epoch": 0.6358444981330991, + "grad_norm": 2.0824906826019287, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.707556962966919, + "num_tokens": 146809290.0, + "step": 5790 + }, + { + "epoch": 0.6359543158357127, + "grad_norm": 2.3031394481658936, + "learning_rate": 1e-06, + "loss": 0.8534, + "mean_token_accuracy": 0.7327942252159119, + "num_tokens": 146831069.0, + "step": 5791 + }, + { + "epoch": 0.6360641335383264, + "grad_norm": 2.263835906982422, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7192068099975586, + "num_tokens": 146854706.0, + "step": 5792 + }, + { + "epoch": 0.63617395124094, + "grad_norm": 2.2456729412078857, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7144918441772461, + "num_tokens": 146878471.0, + "step": 5793 + }, + { + "epoch": 0.6362837689435537, + "grad_norm": 2.176669120788574, + "learning_rate": 1e-06, + "loss": 1.0889, + "mean_token_accuracy": 0.673011302947998, + "num_tokens": 146906569.0, + "step": 5794 + }, + { + "epoch": 0.6363935866461674, + "grad_norm": 2.2100250720977783, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.7019290924072266, + "num_tokens": 146931259.0, + "step": 5795 + }, + { + "epoch": 0.636503404348781, + "grad_norm": 1.9274635314941406, + "learning_rate": 1e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.695388674736023, + "num_tokens": 146965913.0, + "step": 5796 + }, + { + "epoch": 0.6366132220513947, + "grad_norm": 1.9464988708496094, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7153780460357666, + "num_tokens": 146998557.0, + "step": 5797 + }, + 
{ + "epoch": 0.6367230397540083, + "grad_norm": 2.1267051696777344, + "learning_rate": 1e-06, + "loss": 1.0313, + "mean_token_accuracy": 0.6845558285713196, + "num_tokens": 147028107.0, + "step": 5798 + }, + { + "epoch": 0.636832857456622, + "grad_norm": 2.409208059310913, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7169378399848938, + "num_tokens": 147052309.0, + "step": 5799 + }, + { + "epoch": 0.6369426751592356, + "grad_norm": 1.93317711353302, + "learning_rate": 1e-06, + "loss": 1.0523, + "mean_token_accuracy": 0.678719162940979, + "num_tokens": 147083639.0, + "step": 5800 + }, + { + "epoch": 0.6370524928618493, + "grad_norm": 2.1452314853668213, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.710091233253479, + "num_tokens": 147109529.0, + "step": 5801 + }, + { + "epoch": 0.637162310564463, + "grad_norm": 2.2040343284606934, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.707183301448822, + "num_tokens": 147133801.0, + "step": 5802 + }, + { + "epoch": 0.6372721282670767, + "grad_norm": 2.1887905597686768, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7033096551895142, + "num_tokens": 147158160.0, + "step": 5803 + }, + { + "epoch": 0.6373819459696903, + "grad_norm": 2.5010507106781006, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7196361422538757, + "num_tokens": 147177077.0, + "step": 5804 + }, + { + "epoch": 0.637491763672304, + "grad_norm": 2.2225472927093506, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.6966570615768433, + "num_tokens": 147202126.0, + "step": 5805 + }, + { + "epoch": 0.6376015813749176, + "grad_norm": 2.0487101078033447, + "learning_rate": 1e-06, + "loss": 1.0393, + "mean_token_accuracy": 0.6893577575683594, + "num_tokens": 147231665.0, + "step": 5806 + }, + { + "epoch": 0.6377113990775313, + "grad_norm": 2.3877885341644287, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 
0.7026755213737488, + "num_tokens": 147253296.0, + "step": 5807 + }, + { + "epoch": 0.6378212167801449, + "grad_norm": 2.475130558013916, + "learning_rate": 1e-06, + "loss": 0.8785, + "mean_token_accuracy": 0.7338263988494873, + "num_tokens": 147272841.0, + "step": 5808 + }, + { + "epoch": 0.6379310344827587, + "grad_norm": 2.4451332092285156, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7203536629676819, + "num_tokens": 147293977.0, + "step": 5809 + }, + { + "epoch": 0.6380408521853723, + "grad_norm": 2.283022880554199, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.69610196352005, + "num_tokens": 147318873.0, + "step": 5810 + }, + { + "epoch": 0.638150669887986, + "grad_norm": 2.5538856983184814, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7117966413497925, + "num_tokens": 147338833.0, + "step": 5811 + }, + { + "epoch": 0.6382604875905996, + "grad_norm": 2.3839263916015625, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.7020424604415894, + "num_tokens": 147366160.0, + "step": 5812 + }, + { + "epoch": 0.6383703052932133, + "grad_norm": 2.3497300148010254, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7169514298439026, + "num_tokens": 147389336.0, + "step": 5813 + }, + { + "epoch": 0.6384801229958269, + "grad_norm": 2.144606590270996, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.6976207494735718, + "num_tokens": 147416682.0, + "step": 5814 + }, + { + "epoch": 0.6385899406984406, + "grad_norm": 2.095109462738037, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.7042055130004883, + "num_tokens": 147444830.0, + "step": 5815 + }, + { + "epoch": 0.6386997584010542, + "grad_norm": 2.1632540225982666, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.7008857727050781, + "num_tokens": 147471180.0, + "step": 5816 + }, + { + "epoch": 0.638809576103668, + "grad_norm": 
2.2213187217712402, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.71600341796875, + "num_tokens": 147494920.0, + "step": 5817 + }, + { + "epoch": 0.6389193938062816, + "grad_norm": 2.341069221496582, + "learning_rate": 1e-06, + "loss": 1.0744, + "mean_token_accuracy": 0.6862151026725769, + "num_tokens": 147517773.0, + "step": 5818 + }, + { + "epoch": 0.6390292115088952, + "grad_norm": 2.0007364749908447, + "learning_rate": 1e-06, + "loss": 1.0476, + "mean_token_accuracy": 0.6839643120765686, + "num_tokens": 147549828.0, + "step": 5819 + }, + { + "epoch": 0.6391390292115089, + "grad_norm": 2.196934461593628, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.7050431966781616, + "num_tokens": 147577023.0, + "step": 5820 + }, + { + "epoch": 0.6392488469141225, + "grad_norm": 2.2164766788482666, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.7077220678329468, + "num_tokens": 147600622.0, + "step": 5821 + }, + { + "epoch": 0.6393586646167362, + "grad_norm": 2.2587203979492188, + "learning_rate": 1e-06, + "loss": 1.0301, + "mean_token_accuracy": 0.696556568145752, + "num_tokens": 147625757.0, + "step": 5822 + }, + { + "epoch": 0.6394684823193498, + "grad_norm": 2.5485942363739014, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.6877862215042114, + "num_tokens": 147646952.0, + "step": 5823 + }, + { + "epoch": 0.6395783000219636, + "grad_norm": 2.249386787414551, + "learning_rate": 1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.6929464936256409, + "num_tokens": 147672552.0, + "step": 5824 + }, + { + "epoch": 0.6396881177245772, + "grad_norm": 2.6698787212371826, + "learning_rate": 1e-06, + "loss": 1.0413, + "mean_token_accuracy": 0.6922487020492554, + "num_tokens": 147699431.0, + "step": 5825 + }, + { + "epoch": 0.6397979354271909, + "grad_norm": 2.2365376949310303, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7088870406150818, + "num_tokens": 
147724168.0, + "step": 5826 + }, + { + "epoch": 0.6399077531298045, + "grad_norm": 2.287313222885132, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7090440988540649, + "num_tokens": 147749249.0, + "step": 5827 + }, + { + "epoch": 0.6400175708324182, + "grad_norm": 2.332202196121216, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7189132571220398, + "num_tokens": 147773701.0, + "step": 5828 + }, + { + "epoch": 0.6401273885350318, + "grad_norm": 2.3733971118927, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7157050967216492, + "num_tokens": 147794837.0, + "step": 5829 + }, + { + "epoch": 0.6402372062376455, + "grad_norm": 2.219104528427124, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7181386947631836, + "num_tokens": 147819469.0, + "step": 5830 + }, + { + "epoch": 0.6403470239402592, + "grad_norm": 2.180527687072754, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.6990492343902588, + "num_tokens": 147845702.0, + "step": 5831 + }, + { + "epoch": 0.6404568416428729, + "grad_norm": 2.061108350753784, + "learning_rate": 1e-06, + "loss": 0.7701, + "mean_token_accuracy": 0.7483786344528198, + "num_tokens": 147870832.0, + "step": 5832 + }, + { + "epoch": 0.6405666593454865, + "grad_norm": 2.2221157550811768, + "learning_rate": 1e-06, + "loss": 1.0688, + "mean_token_accuracy": 0.6841393709182739, + "num_tokens": 147899563.0, + "step": 5833 + }, + { + "epoch": 0.6406764770481002, + "grad_norm": 2.412323474884033, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.6949855089187622, + "num_tokens": 147924365.0, + "step": 5834 + }, + { + "epoch": 0.6407862947507138, + "grad_norm": 2.372511625289917, + "learning_rate": 1e-06, + "loss": 1.0524, + "mean_token_accuracy": 0.6842128038406372, + "num_tokens": 147948120.0, + "step": 5835 + }, + { + "epoch": 0.6408961124533274, + "grad_norm": 2.2980546951293945, + "learning_rate": 1e-06, + "loss": 
0.956, + "mean_token_accuracy": 0.7137758731842041, + "num_tokens": 147970986.0, + "step": 5836 + }, + { + "epoch": 0.6410059301559411, + "grad_norm": 2.0906291007995605, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7284060716629028, + "num_tokens": 147996879.0, + "step": 5837 + }, + { + "epoch": 0.6411157478585549, + "grad_norm": 1.8536123037338257, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6948489546775818, + "num_tokens": 148031469.0, + "step": 5838 + }, + { + "epoch": 0.6412255655611685, + "grad_norm": 2.371267557144165, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7030313014984131, + "num_tokens": 148053410.0, + "step": 5839 + }, + { + "epoch": 0.6413353832637821, + "grad_norm": 2.1534104347229004, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.709839940071106, + "num_tokens": 148078937.0, + "step": 5840 + }, + { + "epoch": 0.6414452009663958, + "grad_norm": 2.011493682861328, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7240927219390869, + "num_tokens": 148107163.0, + "step": 5841 + }, + { + "epoch": 0.6415550186690094, + "grad_norm": 2.0121331214904785, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.6951236128807068, + "num_tokens": 148136399.0, + "step": 5842 + }, + { + "epoch": 0.6416648363716231, + "grad_norm": 2.5592010021209717, + "learning_rate": 1e-06, + "loss": 1.0367, + "mean_token_accuracy": 0.6901137828826904, + "num_tokens": 148158080.0, + "step": 5843 + }, + { + "epoch": 0.6417746540742367, + "grad_norm": 2.17087721824646, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.6972823143005371, + "num_tokens": 148184449.0, + "step": 5844 + }, + { + "epoch": 0.6418844717768504, + "grad_norm": 2.1694977283477783, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.6978780627250671, + "num_tokens": 148210706.0, + "step": 5845 + }, + { + "epoch": 
0.6419942894794641, + "grad_norm": 2.4956960678100586, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7012438178062439, + "num_tokens": 148233291.0, + "step": 5846 + }, + { + "epoch": 0.6421041071820778, + "grad_norm": 2.0336856842041016, + "learning_rate": 1e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7310588359832764, + "num_tokens": 148259543.0, + "step": 5847 + }, + { + "epoch": 0.6422139248846914, + "grad_norm": 2.4362761974334717, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7088316679000854, + "num_tokens": 148282247.0, + "step": 5848 + }, + { + "epoch": 0.6423237425873051, + "grad_norm": 2.0957300662994385, + "learning_rate": 1e-06, + "loss": 1.0843, + "mean_token_accuracy": 0.6738688945770264, + "num_tokens": 148316832.0, + "step": 5849 + }, + { + "epoch": 0.6424335602899187, + "grad_norm": 2.0372390747070312, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.690446674823761, + "num_tokens": 148344997.0, + "step": 5850 + }, + { + "epoch": 0.6425433779925324, + "grad_norm": 2.47023344039917, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7148244380950928, + "num_tokens": 148365780.0, + "step": 5851 + }, + { + "epoch": 0.642653195695146, + "grad_norm": 1.9554165601730347, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7182681560516357, + "num_tokens": 148393823.0, + "step": 5852 + }, + { + "epoch": 0.6427630133977598, + "grad_norm": 2.130176067352295, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7060616612434387, + "num_tokens": 148420235.0, + "step": 5853 + }, + { + "epoch": 0.6428728311003734, + "grad_norm": 2.299607515335083, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.7141973376274109, + "num_tokens": 148442600.0, + "step": 5854 + }, + { + "epoch": 0.6429826488029871, + "grad_norm": 2.1005678176879883, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 
0.7133280634880066, + "num_tokens": 148469163.0, + "step": 5855 + }, + { + "epoch": 0.6430924665056007, + "grad_norm": 2.0789215564727783, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.7009724378585815, + "num_tokens": 148497658.0, + "step": 5856 + }, + { + "epoch": 0.6432022842082143, + "grad_norm": 1.9834359884262085, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7122718095779419, + "num_tokens": 148524680.0, + "step": 5857 + }, + { + "epoch": 0.643312101910828, + "grad_norm": 2.0109457969665527, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.6878588199615479, + "num_tokens": 148553552.0, + "step": 5858 + }, + { + "epoch": 0.6434219196134416, + "grad_norm": 2.105233907699585, + "learning_rate": 1e-06, + "loss": 1.0317, + "mean_token_accuracy": 0.69535231590271, + "num_tokens": 148581321.0, + "step": 5859 + }, + { + "epoch": 0.6435317373160554, + "grad_norm": 2.6311800479888916, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.6927160024642944, + "num_tokens": 148601425.0, + "step": 5860 + }, + { + "epoch": 0.643641555018669, + "grad_norm": 2.282564640045166, + "learning_rate": 1e-06, + "loss": 1.048, + "mean_token_accuracy": 0.6898008584976196, + "num_tokens": 148626543.0, + "step": 5861 + }, + { + "epoch": 0.6437513727212827, + "grad_norm": 2.0672898292541504, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6914820671081543, + "num_tokens": 148657051.0, + "step": 5862 + }, + { + "epoch": 0.6438611904238963, + "grad_norm": 2.272421360015869, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7091869115829468, + "num_tokens": 148682251.0, + "step": 5863 + }, + { + "epoch": 0.64397100812651, + "grad_norm": 2.1372735500335693, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7074306607246399, + "num_tokens": 148710406.0, + "step": 5864 + }, + { + "epoch": 0.6440808258291236, + "grad_norm": 1.9411147832870483, 
+ "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7142414450645447, + "num_tokens": 148740724.0, + "step": 5865 + }, + { + "epoch": 0.6441906435317373, + "grad_norm": 2.230525255203247, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.6941927671432495, + "num_tokens": 148766176.0, + "step": 5866 + }, + { + "epoch": 0.644300461234351, + "grad_norm": 2.015357494354248, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7073891758918762, + "num_tokens": 148795657.0, + "step": 5867 + }, + { + "epoch": 0.6444102789369647, + "grad_norm": 2.3517842292785645, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7232604026794434, + "num_tokens": 148818220.0, + "step": 5868 + }, + { + "epoch": 0.6445200966395783, + "grad_norm": 2.2919723987579346, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7064261436462402, + "num_tokens": 148840666.0, + "step": 5869 + }, + { + "epoch": 0.644629914342192, + "grad_norm": 2.2838497161865234, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.6945421695709229, + "num_tokens": 148865988.0, + "step": 5870 + }, + { + "epoch": 0.6447397320448056, + "grad_norm": 2.2150638103485107, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7092555165290833, + "num_tokens": 148891297.0, + "step": 5871 + }, + { + "epoch": 0.6448495497474193, + "grad_norm": 2.260707378387451, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.7211394906044006, + "num_tokens": 148914159.0, + "step": 5872 + }, + { + "epoch": 0.6449593674500329, + "grad_norm": 2.4020581245422363, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7210472822189331, + "num_tokens": 148936416.0, + "step": 5873 + }, + { + "epoch": 0.6450691851526466, + "grad_norm": 2.255321741104126, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7106671333312988, + "num_tokens": 148962107.0, + "step": 5874 + 
}, + { + "epoch": 0.6451790028552603, + "grad_norm": 2.1316142082214355, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7179718017578125, + "num_tokens": 148989509.0, + "step": 5875 + }, + { + "epoch": 0.645288820557874, + "grad_norm": 2.1951208114624023, + "learning_rate": 1e-06, + "loss": 0.8565, + "mean_token_accuracy": 0.7287189364433289, + "num_tokens": 149012870.0, + "step": 5876 + }, + { + "epoch": 0.6453986382604876, + "grad_norm": 2.089169979095459, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7027043700218201, + "num_tokens": 149039697.0, + "step": 5877 + }, + { + "epoch": 0.6455084559631012, + "grad_norm": 2.5089261531829834, + "learning_rate": 1e-06, + "loss": 0.8277, + "mean_token_accuracy": 0.7390415668487549, + "num_tokens": 149058983.0, + "step": 5878 + }, + { + "epoch": 0.6456182736657149, + "grad_norm": 1.8887697458267212, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.6940919160842896, + "num_tokens": 149090546.0, + "step": 5879 + }, + { + "epoch": 0.6457280913683285, + "grad_norm": 2.445643186569214, + "learning_rate": 1e-06, + "loss": 0.8126, + "mean_token_accuracy": 0.7420178651809692, + "num_tokens": 149110402.0, + "step": 5880 + }, + { + "epoch": 0.6458379090709422, + "grad_norm": 2.057631254196167, + "learning_rate": 1e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.6841586232185364, + "num_tokens": 149138492.0, + "step": 5881 + }, + { + "epoch": 0.6459477267735559, + "grad_norm": 2.1687874794006348, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6862325668334961, + "num_tokens": 149166365.0, + "step": 5882 + }, + { + "epoch": 0.6460575444761696, + "grad_norm": 2.1698427200317383, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.6962156891822815, + "num_tokens": 149191752.0, + "step": 5883 + }, + { + "epoch": 0.6461673621787832, + "grad_norm": 2.1018686294555664, + "learning_rate": 1e-06, + "loss": 0.9631, + 
"mean_token_accuracy": 0.703914225101471, + "num_tokens": 149220583.0, + "step": 5884 + }, + { + "epoch": 0.6462771798813969, + "grad_norm": 2.2594549655914307, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.732460618019104, + "num_tokens": 149244463.0, + "step": 5885 + }, + { + "epoch": 0.6463869975840105, + "grad_norm": 2.0920522212982178, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7257784605026245, + "num_tokens": 149270057.0, + "step": 5886 + }, + { + "epoch": 0.6464968152866242, + "grad_norm": 2.1563186645507812, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.6988616585731506, + "num_tokens": 149298397.0, + "step": 5887 + }, + { + "epoch": 0.6466066329892378, + "grad_norm": 2.3067028522491455, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7087584733963013, + "num_tokens": 149320793.0, + "step": 5888 + }, + { + "epoch": 0.6467164506918516, + "grad_norm": 2.1979730129241943, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7044062614440918, + "num_tokens": 149345807.0, + "step": 5889 + }, + { + "epoch": 0.6468262683944652, + "grad_norm": 2.429561138153076, + "learning_rate": 1e-06, + "loss": 1.0162, + "mean_token_accuracy": 0.6903746128082275, + "num_tokens": 149368381.0, + "step": 5890 + }, + { + "epoch": 0.6469360860970789, + "grad_norm": 2.2539784908294678, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7090040445327759, + "num_tokens": 149391329.0, + "step": 5891 + }, + { + "epoch": 0.6470459037996925, + "grad_norm": 1.9530524015426636, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7041785717010498, + "num_tokens": 149421511.0, + "step": 5892 + }, + { + "epoch": 0.6471557215023062, + "grad_norm": 2.0452682971954346, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6950365304946899, + "num_tokens": 149450380.0, + "step": 5893 + }, + { + "epoch": 0.6472655392049198, + 
"grad_norm": 2.460766315460205, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7172545194625854, + "num_tokens": 149469905.0, + "step": 5894 + }, + { + "epoch": 0.6473753569075335, + "grad_norm": 2.028216600418091, + "learning_rate": 1e-06, + "loss": 0.928, + "mean_token_accuracy": 0.7170184850692749, + "num_tokens": 149498991.0, + "step": 5895 + }, + { + "epoch": 0.6474851746101472, + "grad_norm": 2.100666046142578, + "learning_rate": 1e-06, + "loss": 1.0781, + "mean_token_accuracy": 0.6807166934013367, + "num_tokens": 149526839.0, + "step": 5896 + }, + { + "epoch": 0.6475949923127609, + "grad_norm": 2.410447835922241, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7280116081237793, + "num_tokens": 149548988.0, + "step": 5897 + }, + { + "epoch": 0.6477048100153745, + "grad_norm": 2.1581454277038574, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.6969061493873596, + "num_tokens": 149574848.0, + "step": 5898 + }, + { + "epoch": 0.6478146277179881, + "grad_norm": 2.0308845043182373, + "learning_rate": 1e-06, + "loss": 1.0743, + "mean_token_accuracy": 0.6784504652023315, + "num_tokens": 149605169.0, + "step": 5899 + }, + { + "epoch": 0.6479244454206018, + "grad_norm": 1.9535678625106812, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7090044617652893, + "num_tokens": 149635380.0, + "step": 5900 + }, + { + "epoch": 0.6480342631232154, + "grad_norm": 2.4381890296936035, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.719653844833374, + "num_tokens": 149655640.0, + "step": 5901 + }, + { + "epoch": 0.6481440808258291, + "grad_norm": 2.1495094299316406, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7105117440223694, + "num_tokens": 149681529.0, + "step": 5902 + }, + { + "epoch": 0.6482538985284428, + "grad_norm": 2.312742233276367, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7212591171264648, + 
"num_tokens": 149703832.0, + "step": 5903 + }, + { + "epoch": 0.6483637162310565, + "grad_norm": 1.831667423248291, + "learning_rate": 1e-06, + "loss": 1.0601, + "mean_token_accuracy": 0.6788536310195923, + "num_tokens": 149740966.0, + "step": 5904 + }, + { + "epoch": 0.6484735339336701, + "grad_norm": 2.1130378246307373, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.6977633237838745, + "num_tokens": 149767803.0, + "step": 5905 + }, + { + "epoch": 0.6485833516362838, + "grad_norm": 2.4777729511260986, + "learning_rate": 1e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7251249551773071, + "num_tokens": 149788137.0, + "step": 5906 + }, + { + "epoch": 0.6486931693388974, + "grad_norm": 2.2593116760253906, + "learning_rate": 1e-06, + "loss": 1.0447, + "mean_token_accuracy": 0.6923288106918335, + "num_tokens": 149814393.0, + "step": 5907 + }, + { + "epoch": 0.6488029870415111, + "grad_norm": 2.1733977794647217, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7166908383369446, + "num_tokens": 149838365.0, + "step": 5908 + }, + { + "epoch": 0.6489128047441247, + "grad_norm": 2.1516060829162598, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7111413478851318, + "num_tokens": 149864753.0, + "step": 5909 + }, + { + "epoch": 0.6490226224467384, + "grad_norm": 2.3035900592803955, + "learning_rate": 1e-06, + "loss": 0.8562, + "mean_token_accuracy": 0.7339070439338684, + "num_tokens": 149888428.0, + "step": 5910 + }, + { + "epoch": 0.6491324401493521, + "grad_norm": 2.1800951957702637, + "learning_rate": 1e-06, + "loss": 1.039, + "mean_token_accuracy": 0.682630181312561, + "num_tokens": 149915252.0, + "step": 5911 + }, + { + "epoch": 0.6492422578519658, + "grad_norm": 2.2229297161102295, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.6959689855575562, + "num_tokens": 149939485.0, + "step": 5912 + }, + { + "epoch": 0.6493520755545794, + "grad_norm": 1.7903093099594116, + 
"learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.705004096031189, + "num_tokens": 149973265.0, + "step": 5913 + }, + { + "epoch": 0.6494618932571931, + "grad_norm": 1.9104299545288086, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7152036428451538, + "num_tokens": 150003280.0, + "step": 5914 + }, + { + "epoch": 0.6495717109598067, + "grad_norm": 2.189969778060913, + "learning_rate": 1e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.7260679602622986, + "num_tokens": 150029391.0, + "step": 5915 + }, + { + "epoch": 0.6496815286624203, + "grad_norm": 2.2026383876800537, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7030452489852905, + "num_tokens": 150053711.0, + "step": 5916 + }, + { + "epoch": 0.649791346365034, + "grad_norm": 2.0139429569244385, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.6987413167953491, + "num_tokens": 150082321.0, + "step": 5917 + }, + { + "epoch": 0.6499011640676478, + "grad_norm": 2.161111831665039, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.698918342590332, + "num_tokens": 150110356.0, + "step": 5918 + }, + { + "epoch": 0.6500109817702614, + "grad_norm": 2.5051918029785156, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7146449089050293, + "num_tokens": 150131756.0, + "step": 5919 + }, + { + "epoch": 0.650120799472875, + "grad_norm": 2.1483442783355713, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7048729658126831, + "num_tokens": 150157510.0, + "step": 5920 + }, + { + "epoch": 0.6502306171754887, + "grad_norm": 2.3015682697296143, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7121895551681519, + "num_tokens": 150182240.0, + "step": 5921 + }, + { + "epoch": 0.6503404348781023, + "grad_norm": 2.1387906074523926, + "learning_rate": 1e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.6922056674957275, + "num_tokens": 150208610.0, + "step": 5922 + 
}, + { + "epoch": 0.650450252580716, + "grad_norm": 2.053004741668701, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.7021629214286804, + "num_tokens": 150237452.0, + "step": 5923 + }, + { + "epoch": 0.6505600702833296, + "grad_norm": 2.2828612327575684, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.7341289520263672, + "num_tokens": 150259128.0, + "step": 5924 + }, + { + "epoch": 0.6506698879859434, + "grad_norm": 2.155339479446411, + "learning_rate": 1e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.6981340050697327, + "num_tokens": 150284648.0, + "step": 5925 + }, + { + "epoch": 0.650779705688557, + "grad_norm": 2.285146713256836, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.6975045800209045, + "num_tokens": 150309988.0, + "step": 5926 + }, + { + "epoch": 0.6508895233911707, + "grad_norm": 1.937778115272522, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7074586153030396, + "num_tokens": 150340037.0, + "step": 5927 + }, + { + "epoch": 0.6509993410937843, + "grad_norm": 2.0507473945617676, + "learning_rate": 1e-06, + "loss": 1.0497, + "mean_token_accuracy": 0.68907231092453, + "num_tokens": 150368251.0, + "step": 5928 + }, + { + "epoch": 0.651109158796398, + "grad_norm": 1.9970890283584595, + "learning_rate": 1e-06, + "loss": 1.0497, + "mean_token_accuracy": 0.6832448244094849, + "num_tokens": 150397890.0, + "step": 5929 + }, + { + "epoch": 0.6512189764990116, + "grad_norm": 2.097792863845825, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6926143169403076, + "num_tokens": 150424789.0, + "step": 5930 + }, + { + "epoch": 0.6513287942016253, + "grad_norm": 2.1809158325195312, + "learning_rate": 1e-06, + "loss": 1.0984, + "mean_token_accuracy": 0.6694746017456055, + "num_tokens": 150452428.0, + "step": 5931 + }, + { + "epoch": 0.651438611904239, + "grad_norm": 1.9341002702713013, + "learning_rate": 1e-06, + "loss": 0.9885, + 
"mean_token_accuracy": 0.7012938261032104, + "num_tokens": 150483510.0, + "step": 5932 + }, + { + "epoch": 0.6515484296068527, + "grad_norm": 2.4158668518066406, + "learning_rate": 1e-06, + "loss": 0.869, + "mean_token_accuracy": 0.7254488468170166, + "num_tokens": 150502945.0, + "step": 5933 + }, + { + "epoch": 0.6516582473094663, + "grad_norm": 2.0204856395721436, + "learning_rate": 1e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7097630500793457, + "num_tokens": 150530709.0, + "step": 5934 + }, + { + "epoch": 0.65176806501208, + "grad_norm": 2.140345335006714, + "learning_rate": 1e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.7031127214431763, + "num_tokens": 150556579.0, + "step": 5935 + }, + { + "epoch": 0.6518778827146936, + "grad_norm": 2.525942087173462, + "learning_rate": 1e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.7316783666610718, + "num_tokens": 150575096.0, + "step": 5936 + }, + { + "epoch": 0.6519877004173072, + "grad_norm": 2.2745981216430664, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7189786434173584, + "num_tokens": 150597023.0, + "step": 5937 + }, + { + "epoch": 0.6520975181199209, + "grad_norm": 2.1692054271698, + "learning_rate": 1e-06, + "loss": 1.0411, + "mean_token_accuracy": 0.6787137985229492, + "num_tokens": 150623746.0, + "step": 5938 + }, + { + "epoch": 0.6522073358225345, + "grad_norm": 2.64359974861145, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7274688482284546, + "num_tokens": 150642003.0, + "step": 5939 + }, + { + "epoch": 0.6523171535251483, + "grad_norm": 2.2097678184509277, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.6980202198028564, + "num_tokens": 150668165.0, + "step": 5940 + }, + { + "epoch": 0.6524269712277619, + "grad_norm": 2.4235074520111084, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7118088006973267, + "num_tokens": 150692765.0, + "step": 5941 + }, + { + "epoch": 0.6525367889303756, + 
"grad_norm": 2.4381537437438965, + "learning_rate": 1e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.6850495338439941, + "num_tokens": 150714314.0, + "step": 5942 + }, + { + "epoch": 0.6526466066329892, + "grad_norm": 2.183516025543213, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.6878527402877808, + "num_tokens": 150741905.0, + "step": 5943 + }, + { + "epoch": 0.6527564243356029, + "grad_norm": 2.0955405235290527, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7062681913375854, + "num_tokens": 150769913.0, + "step": 5944 + }, + { + "epoch": 0.6528662420382165, + "grad_norm": 2.0256733894348145, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7072026133537292, + "num_tokens": 150796724.0, + "step": 5945 + }, + { + "epoch": 0.6529760597408302, + "grad_norm": 2.1282200813293457, + "learning_rate": 1e-06, + "loss": 1.0638, + "mean_token_accuracy": 0.6771254539489746, + "num_tokens": 150823777.0, + "step": 5946 + }, + { + "epoch": 0.6530858774434439, + "grad_norm": 2.1877808570861816, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7076050043106079, + "num_tokens": 150848686.0, + "step": 5947 + }, + { + "epoch": 0.6531956951460576, + "grad_norm": 2.370326280593872, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7150808572769165, + "num_tokens": 150871056.0, + "step": 5948 + }, + { + "epoch": 0.6533055128486712, + "grad_norm": 2.155324935913086, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7019518613815308, + "num_tokens": 150896255.0, + "step": 5949 + }, + { + "epoch": 0.6534153305512849, + "grad_norm": 2.405463933944702, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7049130797386169, + "num_tokens": 150920767.0, + "step": 5950 + }, + { + "epoch": 0.6535251482538985, + "grad_norm": 2.5064473152160645, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7194204330444336, + 
"num_tokens": 150941656.0, + "step": 5951 + }, + { + "epoch": 0.6536349659565122, + "grad_norm": 2.0923879146575928, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7174800038337708, + "num_tokens": 150968795.0, + "step": 5952 + }, + { + "epoch": 0.6537447836591258, + "grad_norm": 2.2526605129241943, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7264657616615295, + "num_tokens": 150993572.0, + "step": 5953 + }, + { + "epoch": 0.6538546013617396, + "grad_norm": 2.259075403213501, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.717764139175415, + "num_tokens": 151018149.0, + "step": 5954 + }, + { + "epoch": 0.6539644190643532, + "grad_norm": 2.1787476539611816, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7118034362792969, + "num_tokens": 151044929.0, + "step": 5955 + }, + { + "epoch": 0.6540742367669669, + "grad_norm": 2.5015814304351807, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.715476393699646, + "num_tokens": 151065183.0, + "step": 5956 + }, + { + "epoch": 0.6541840544695805, + "grad_norm": 2.1587767601013184, + "learning_rate": 1e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.7309917211532593, + "num_tokens": 151089091.0, + "step": 5957 + }, + { + "epoch": 0.6542938721721941, + "grad_norm": 2.0617241859436035, + "learning_rate": 1e-06, + "loss": 1.0906, + "mean_token_accuracy": 0.6824586391448975, + "num_tokens": 151116292.0, + "step": 5958 + }, + { + "epoch": 0.6544036898748078, + "grad_norm": 2.2271249294281006, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7034907937049866, + "num_tokens": 151141248.0, + "step": 5959 + }, + { + "epoch": 0.6545135075774214, + "grad_norm": 2.5476911067962646, + "learning_rate": 1e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.7359710335731506, + "num_tokens": 151159832.0, + "step": 5960 + }, + { + "epoch": 0.6546233252800352, + "grad_norm": 2.1620876789093018, + 
"learning_rate": 1e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.6989967823028564, + "num_tokens": 151185389.0, + "step": 5961 + }, + { + "epoch": 0.6547331429826488, + "grad_norm": 2.1292026042938232, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.695624828338623, + "num_tokens": 151211987.0, + "step": 5962 + }, + { + "epoch": 0.6548429606852625, + "grad_norm": 2.2967207431793213, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7444601058959961, + "num_tokens": 151235321.0, + "step": 5963 + }, + { + "epoch": 0.6549527783878761, + "grad_norm": 2.3431429862976074, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7040601372718811, + "num_tokens": 151256558.0, + "step": 5964 + }, + { + "epoch": 0.6550625960904898, + "grad_norm": 1.9045557975769043, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7021963000297546, + "num_tokens": 151289357.0, + "step": 5965 + }, + { + "epoch": 0.6551724137931034, + "grad_norm": 2.1508381366729736, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7163447737693787, + "num_tokens": 151315437.0, + "step": 5966 + }, + { + "epoch": 0.6552822314957171, + "grad_norm": 1.980650782585144, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7077821493148804, + "num_tokens": 151342874.0, + "step": 5967 + }, + { + "epoch": 0.6553920491983307, + "grad_norm": 2.20049786567688, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.7040597200393677, + "num_tokens": 151368181.0, + "step": 5968 + }, + { + "epoch": 0.6555018669009445, + "grad_norm": 2.34676194190979, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7294469475746155, + "num_tokens": 151390322.0, + "step": 5969 + }, + { + "epoch": 0.6556116846035581, + "grad_norm": 2.489331007003784, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7136253118515015, + "num_tokens": 151408878.0, + "step": 5970 + }, 
+ { + "epoch": 0.6557215023061718, + "grad_norm": 2.4670069217681885, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7082186937332153, + "num_tokens": 151431857.0, + "step": 5971 + }, + { + "epoch": 0.6558313200087854, + "grad_norm": 2.1391849517822266, + "learning_rate": 1e-06, + "loss": 0.8421, + "mean_token_accuracy": 0.7362914085388184, + "num_tokens": 151455471.0, + "step": 5972 + }, + { + "epoch": 0.6559411377113991, + "grad_norm": 2.2358336448669434, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.6998792290687561, + "num_tokens": 151480998.0, + "step": 5973 + }, + { + "epoch": 0.6560509554140127, + "grad_norm": 2.2033605575561523, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.6988337635993958, + "num_tokens": 151506239.0, + "step": 5974 + }, + { + "epoch": 0.6561607731166264, + "grad_norm": 2.1575605869293213, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7129644155502319, + "num_tokens": 151531593.0, + "step": 5975 + }, + { + "epoch": 0.6562705908192401, + "grad_norm": 2.33225679397583, + "learning_rate": 1e-06, + "loss": 1.039, + "mean_token_accuracy": 0.6864681243896484, + "num_tokens": 151555816.0, + "step": 5976 + }, + { + "epoch": 0.6563804085218538, + "grad_norm": 2.0687108039855957, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.7258811593055725, + "num_tokens": 151581288.0, + "step": 5977 + }, + { + "epoch": 0.6564902262244674, + "grad_norm": 2.039824962615967, + "learning_rate": 1e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.6844369173049927, + "num_tokens": 151609458.0, + "step": 5978 + }, + { + "epoch": 0.656600043927081, + "grad_norm": 2.0860469341278076, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7057430744171143, + "num_tokens": 151636953.0, + "step": 5979 + }, + { + "epoch": 0.6567098616296947, + "grad_norm": 2.1310160160064697, + "learning_rate": 1e-06, + "loss": 0.9256, + 
"mean_token_accuracy": 0.7252601385116577, + "num_tokens": 151661803.0, + "step": 5980 + }, + { + "epoch": 0.6568196793323083, + "grad_norm": 2.454613447189331, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.6977529525756836, + "num_tokens": 151682627.0, + "step": 5981 + }, + { + "epoch": 0.656929497034922, + "grad_norm": 2.1490561962127686, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7223876714706421, + "num_tokens": 151706987.0, + "step": 5982 + }, + { + "epoch": 0.6570393147375357, + "grad_norm": 2.0673651695251465, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7082433104515076, + "num_tokens": 151734944.0, + "step": 5983 + }, + { + "epoch": 0.6571491324401494, + "grad_norm": 2.3336613178253174, + "learning_rate": 1e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6976135969161987, + "num_tokens": 151758018.0, + "step": 5984 + }, + { + "epoch": 0.657258950142763, + "grad_norm": 2.2948527336120605, + "learning_rate": 1e-06, + "loss": 1.0608, + "mean_token_accuracy": 0.6770957708358765, + "num_tokens": 151782918.0, + "step": 5985 + }, + { + "epoch": 0.6573687678453767, + "grad_norm": 2.6467466354370117, + "learning_rate": 1e-06, + "loss": 0.8523, + "mean_token_accuracy": 0.7331468462944031, + "num_tokens": 151801452.0, + "step": 5986 + }, + { + "epoch": 0.6574785855479903, + "grad_norm": 2.31964111328125, + "learning_rate": 1e-06, + "loss": 0.8647, + "mean_token_accuracy": 0.7341270446777344, + "num_tokens": 151823914.0, + "step": 5987 + }, + { + "epoch": 0.657588403250604, + "grad_norm": 2.21355938911438, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7284880876541138, + "num_tokens": 151848475.0, + "step": 5988 + }, + { + "epoch": 0.6576982209532176, + "grad_norm": 2.377265691757202, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7096619009971619, + "num_tokens": 151870212.0, + "step": 5989 + }, + { + "epoch": 0.6578080386558314, + 
"grad_norm": 2.4459068775177, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7087670564651489, + "num_tokens": 151891466.0, + "step": 5990 + }, + { + "epoch": 0.657917856358445, + "grad_norm": 2.4019088745117188, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.698623776435852, + "num_tokens": 151915120.0, + "step": 5991 + }, + { + "epoch": 0.6580276740610587, + "grad_norm": 2.2504453659057617, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.697431206703186, + "num_tokens": 151939867.0, + "step": 5992 + }, + { + "epoch": 0.6581374917636723, + "grad_norm": 2.1746017932891846, + "learning_rate": 1e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.7503368854522705, + "num_tokens": 151963771.0, + "step": 5993 + }, + { + "epoch": 0.658247309466286, + "grad_norm": 2.4827237129211426, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7084943056106567, + "num_tokens": 151983216.0, + "step": 5994 + }, + { + "epoch": 0.6583571271688996, + "grad_norm": 2.12009596824646, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7048954367637634, + "num_tokens": 152009130.0, + "step": 5995 + }, + { + "epoch": 0.6584669448715132, + "grad_norm": 2.0719552040100098, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7022150158882141, + "num_tokens": 152036899.0, + "step": 5996 + }, + { + "epoch": 0.6585767625741269, + "grad_norm": 2.17026948928833, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7271174192428589, + "num_tokens": 152062311.0, + "step": 5997 + }, + { + "epoch": 0.6586865802767407, + "grad_norm": 2.365169048309326, + "learning_rate": 1e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.7034039497375488, + "num_tokens": 152084434.0, + "step": 5998 + }, + { + "epoch": 0.6587963979793543, + "grad_norm": 1.9151506423950195, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7106022238731384, + "num_tokens": 
152116468.0, + "step": 5999 + }, + { + "epoch": 0.6589062156819679, + "grad_norm": 1.9867650270462036, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7215536832809448, + "num_tokens": 152143107.0, + "step": 6000 + }, + { + "epoch": 0.6590160333845816, + "grad_norm": 2.1183319091796875, + "learning_rate": 1e-06, + "loss": 1.068, + "mean_token_accuracy": 0.6834422945976257, + "num_tokens": 152169217.0, + "step": 6001 + }, + { + "epoch": 0.6591258510871952, + "grad_norm": 2.3952620029449463, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7151056528091431, + "num_tokens": 152190034.0, + "step": 6002 + }, + { + "epoch": 0.6592356687898089, + "grad_norm": 2.4010255336761475, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7137499451637268, + "num_tokens": 152211192.0, + "step": 6003 + }, + { + "epoch": 0.6593454864924225, + "grad_norm": 2.4264109134674072, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7054193019866943, + "num_tokens": 152231957.0, + "step": 6004 + }, + { + "epoch": 0.6594553041950363, + "grad_norm": 2.199437141418457, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7161535024642944, + "num_tokens": 152255321.0, + "step": 6005 + }, + { + "epoch": 0.6595651218976499, + "grad_norm": 2.3030543327331543, + "learning_rate": 1e-06, + "loss": 0.8937, + "mean_token_accuracy": 0.7265468835830688, + "num_tokens": 152276946.0, + "step": 6006 + }, + { + "epoch": 0.6596749396002636, + "grad_norm": 2.203511953353882, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7044182419776917, + "num_tokens": 152302168.0, + "step": 6007 + }, + { + "epoch": 0.6597847573028772, + "grad_norm": 2.1256563663482666, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.6960402727127075, + "num_tokens": 152329997.0, + "step": 6008 + }, + { + "epoch": 0.6598945750054909, + "grad_norm": 2.4728000164031982, + "learning_rate": 1e-06, + 
"loss": 0.9622, + "mean_token_accuracy": 0.7096377611160278, + "num_tokens": 152351257.0, + "step": 6009 + }, + { + "epoch": 0.6600043927081045, + "grad_norm": 2.3213772773742676, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.700541615486145, + "num_tokens": 152375095.0, + "step": 6010 + }, + { + "epoch": 0.6601142104107182, + "grad_norm": 2.065004825592041, + "learning_rate": 1e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.7228280305862427, + "num_tokens": 152400708.0, + "step": 6011 + }, + { + "epoch": 0.6602240281133319, + "grad_norm": 2.278968572616577, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.6889193058013916, + "num_tokens": 152424948.0, + "step": 6012 + }, + { + "epoch": 0.6603338458159456, + "grad_norm": 2.3862202167510986, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7092159390449524, + "num_tokens": 152448385.0, + "step": 6013 + }, + { + "epoch": 0.6604436635185592, + "grad_norm": 2.0454983711242676, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.6894329786300659, + "num_tokens": 152480502.0, + "step": 6014 + }, + { + "epoch": 0.6605534812211729, + "grad_norm": 2.2318403720855713, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.6997308135032654, + "num_tokens": 152506113.0, + "step": 6015 + }, + { + "epoch": 0.6606632989237865, + "grad_norm": 2.267021894454956, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.6937423348426819, + "num_tokens": 152531620.0, + "step": 6016 + }, + { + "epoch": 0.6607731166264001, + "grad_norm": 2.3615260124206543, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.7044066190719604, + "num_tokens": 152553614.0, + "step": 6017 + }, + { + "epoch": 0.6608829343290138, + "grad_norm": 2.161259174346924, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7107357382774353, + "num_tokens": 152577887.0, + "step": 6018 + }, + { + "epoch": 
0.6609927520316276, + "grad_norm": 2.3251285552978516, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7182475328445435, + "num_tokens": 152599745.0, + "step": 6019 + }, + { + "epoch": 0.6611025697342412, + "grad_norm": 1.92031991481781, + "learning_rate": 1e-06, + "loss": 1.1011, + "mean_token_accuracy": 0.6744036674499512, + "num_tokens": 152634559.0, + "step": 6020 + }, + { + "epoch": 0.6612123874368548, + "grad_norm": 1.994775652885437, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.7055755257606506, + "num_tokens": 152662721.0, + "step": 6021 + }, + { + "epoch": 0.6613222051394685, + "grad_norm": 2.1988461017608643, + "learning_rate": 1e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7259217500686646, + "num_tokens": 152686675.0, + "step": 6022 + }, + { + "epoch": 0.6614320228420821, + "grad_norm": 2.291548490524292, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.711284875869751, + "num_tokens": 152710221.0, + "step": 6023 + }, + { + "epoch": 0.6615418405446958, + "grad_norm": 2.1812779903411865, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.7022274136543274, + "num_tokens": 152737096.0, + "step": 6024 + }, + { + "epoch": 0.6616516582473094, + "grad_norm": 2.2401764392852783, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7274057865142822, + "num_tokens": 152760081.0, + "step": 6025 + }, + { + "epoch": 0.6617614759499231, + "grad_norm": 2.1381847858428955, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7186095714569092, + "num_tokens": 152785585.0, + "step": 6026 + }, + { + "epoch": 0.6618712936525368, + "grad_norm": 2.2075581550598145, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.7061449289321899, + "num_tokens": 152810992.0, + "step": 6027 + }, + { + "epoch": 0.6619811113551505, + "grad_norm": 2.4011995792388916, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 
0.695078432559967, + "num_tokens": 152833613.0, + "step": 6028 + }, + { + "epoch": 0.6620909290577641, + "grad_norm": 2.199655294418335, + "learning_rate": 1e-06, + "loss": 1.0924, + "mean_token_accuracy": 0.675258219242096, + "num_tokens": 152861028.0, + "step": 6029 + }, + { + "epoch": 0.6622007467603778, + "grad_norm": 2.1527154445648193, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7046746611595154, + "num_tokens": 152886855.0, + "step": 6030 + }, + { + "epoch": 0.6623105644629914, + "grad_norm": 2.4843859672546387, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7202556729316711, + "num_tokens": 152906650.0, + "step": 6031 + }, + { + "epoch": 0.6624203821656051, + "grad_norm": 2.2178940773010254, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7156752347946167, + "num_tokens": 152930484.0, + "step": 6032 + }, + { + "epoch": 0.6625301998682187, + "grad_norm": 2.274691104888916, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.6903890371322632, + "num_tokens": 152953497.0, + "step": 6033 + }, + { + "epoch": 0.6626400175708325, + "grad_norm": 1.8701633214950562, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7104443311691284, + "num_tokens": 152986979.0, + "step": 6034 + }, + { + "epoch": 0.6627498352734461, + "grad_norm": 2.4210779666900635, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.714594841003418, + "num_tokens": 153008251.0, + "step": 6035 + }, + { + "epoch": 0.6628596529760598, + "grad_norm": 2.1081454753875732, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7282127141952515, + "num_tokens": 153034335.0, + "step": 6036 + }, + { + "epoch": 0.6629694706786734, + "grad_norm": 2.213758945465088, + "learning_rate": 1e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.6961175799369812, + "num_tokens": 153060129.0, + "step": 6037 + }, + { + "epoch": 0.663079288381287, + "grad_norm": 
1.9791182279586792, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7256920337677002, + "num_tokens": 153088430.0, + "step": 6038 + }, + { + "epoch": 0.6631891060839007, + "grad_norm": 2.3751280307769775, + "learning_rate": 1e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.7351356744766235, + "num_tokens": 153109968.0, + "step": 6039 + }, + { + "epoch": 0.6632989237865143, + "grad_norm": 2.1379477977752686, + "learning_rate": 1e-06, + "loss": 1.0476, + "mean_token_accuracy": 0.6834976673126221, + "num_tokens": 153138872.0, + "step": 6040 + }, + { + "epoch": 0.6634087414891281, + "grad_norm": 2.4738593101501465, + "learning_rate": 1e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7211048603057861, + "num_tokens": 153157721.0, + "step": 6041 + }, + { + "epoch": 0.6635185591917417, + "grad_norm": 2.098527193069458, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7115330696105957, + "num_tokens": 153187972.0, + "step": 6042 + }, + { + "epoch": 0.6636283768943554, + "grad_norm": 2.7120275497436523, + "learning_rate": 1e-06, + "loss": 0.8617, + "mean_token_accuracy": 0.7373373508453369, + "num_tokens": 153205911.0, + "step": 6043 + }, + { + "epoch": 0.663738194596969, + "grad_norm": 2.4310121536254883, + "learning_rate": 1e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.707057774066925, + "num_tokens": 153227282.0, + "step": 6044 + }, + { + "epoch": 0.6638480122995827, + "grad_norm": 2.5931220054626465, + "learning_rate": 1e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.7229920625686646, + "num_tokens": 153246579.0, + "step": 6045 + }, + { + "epoch": 0.6639578300021963, + "grad_norm": 2.1473350524902344, + "learning_rate": 1e-06, + "loss": 1.0076, + "mean_token_accuracy": 0.7106199264526367, + "num_tokens": 153273794.0, + "step": 6046 + }, + { + "epoch": 0.66406764770481, + "grad_norm": 2.284240245819092, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7180445194244385, + "num_tokens": 
153297172.0, + "step": 6047 + }, + { + "epoch": 0.6641774654074237, + "grad_norm": 2.0688939094543457, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7053377032279968, + "num_tokens": 153325164.0, + "step": 6048 + }, + { + "epoch": 0.6642872831100374, + "grad_norm": 2.8144335746765137, + "learning_rate": 1e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7219722270965576, + "num_tokens": 153342379.0, + "step": 6049 + }, + { + "epoch": 0.664397100812651, + "grad_norm": 2.1589741706848145, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.6926467418670654, + "num_tokens": 153368243.0, + "step": 6050 + }, + { + "epoch": 0.6645069185152647, + "grad_norm": 2.1750450134277344, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.713493287563324, + "num_tokens": 153393788.0, + "step": 6051 + }, + { + "epoch": 0.6646167362178783, + "grad_norm": 2.011220932006836, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7022706270217896, + "num_tokens": 153422848.0, + "step": 6052 + }, + { + "epoch": 0.664726553920492, + "grad_norm": 2.122809886932373, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7147396802902222, + "num_tokens": 153448707.0, + "step": 6053 + }, + { + "epoch": 0.6648363716231056, + "grad_norm": 2.1660492420196533, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7214608192443848, + "num_tokens": 153471742.0, + "step": 6054 + }, + { + "epoch": 0.6649461893257194, + "grad_norm": 2.245666027069092, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6843581199645996, + "num_tokens": 153498568.0, + "step": 6055 + }, + { + "epoch": 0.665056007028333, + "grad_norm": 1.8567396402359009, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.6998099088668823, + "num_tokens": 153533857.0, + "step": 6056 + }, + { + "epoch": 0.6651658247309467, + "grad_norm": 2.5151596069335938, + "learning_rate": 1e-06, + 
"loss": 0.8738, + "mean_token_accuracy": 0.726702094078064, + "num_tokens": 153552427.0, + "step": 6057 + }, + { + "epoch": 0.6652756424335603, + "grad_norm": 2.289033889770508, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7245945930480957, + "num_tokens": 153573463.0, + "step": 6058 + }, + { + "epoch": 0.665385460136174, + "grad_norm": 2.2356746196746826, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7093007564544678, + "num_tokens": 153596839.0, + "step": 6059 + }, + { + "epoch": 0.6654952778387876, + "grad_norm": 2.2452642917633057, + "learning_rate": 1e-06, + "loss": 0.8465, + "mean_token_accuracy": 0.7266750931739807, + "num_tokens": 153619045.0, + "step": 6060 + }, + { + "epoch": 0.6656050955414012, + "grad_norm": 2.0428524017333984, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7249734401702881, + "num_tokens": 153645678.0, + "step": 6061 + }, + { + "epoch": 0.6657149132440149, + "grad_norm": 1.944753885269165, + "learning_rate": 1e-06, + "loss": 1.0518, + "mean_token_accuracy": 0.6789778470993042, + "num_tokens": 153676158.0, + "step": 6062 + }, + { + "epoch": 0.6658247309466286, + "grad_norm": 2.1452815532684326, + "learning_rate": 1e-06, + "loss": 1.042, + "mean_token_accuracy": 0.6906052827835083, + "num_tokens": 153703380.0, + "step": 6063 + }, + { + "epoch": 0.6659345486492423, + "grad_norm": 2.348130941390991, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7142660617828369, + "num_tokens": 153726199.0, + "step": 6064 + }, + { + "epoch": 0.6660443663518559, + "grad_norm": 2.4721856117248535, + "learning_rate": 1e-06, + "loss": 0.8834, + "mean_token_accuracy": 0.7295671701431274, + "num_tokens": 153747274.0, + "step": 6065 + }, + { + "epoch": 0.6661541840544696, + "grad_norm": 2.1739511489868164, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7272161841392517, + "num_tokens": 153771402.0, + "step": 6066 + }, + { + "epoch": 
0.6662640017570832, + "grad_norm": 2.383751153945923, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.7094005346298218, + "num_tokens": 153794722.0, + "step": 6067 + }, + { + "epoch": 0.6663738194596969, + "grad_norm": 2.4523472785949707, + "learning_rate": 1e-06, + "loss": 0.8317, + "mean_token_accuracy": 0.7421706914901733, + "num_tokens": 153815608.0, + "step": 6068 + }, + { + "epoch": 0.6664836371623105, + "grad_norm": 2.4687893390655518, + "learning_rate": 1e-06, + "loss": 1.0406, + "mean_token_accuracy": 0.6856561899185181, + "num_tokens": 153838015.0, + "step": 6069 + }, + { + "epoch": 0.6665934548649243, + "grad_norm": 1.9275964498519897, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.6973301768302917, + "num_tokens": 153867257.0, + "step": 6070 + }, + { + "epoch": 0.6667032725675379, + "grad_norm": 2.1849355697631836, + "learning_rate": 1e-06, + "loss": 0.8523, + "mean_token_accuracy": 0.7286384701728821, + "num_tokens": 153891300.0, + "step": 6071 + }, + { + "epoch": 0.6668130902701516, + "grad_norm": 1.9722371101379395, + "learning_rate": 1e-06, + "loss": 1.0236, + "mean_token_accuracy": 0.694920539855957, + "num_tokens": 153922316.0, + "step": 6072 + }, + { + "epoch": 0.6669229079727652, + "grad_norm": 1.936551809310913, + "learning_rate": 1e-06, + "loss": 1.102, + "mean_token_accuracy": 0.6680231690406799, + "num_tokens": 153956724.0, + "step": 6073 + }, + { + "epoch": 0.6670327256753789, + "grad_norm": 2.0942351818084717, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7161639928817749, + "num_tokens": 153985200.0, + "step": 6074 + }, + { + "epoch": 0.6671425433779925, + "grad_norm": 2.2395176887512207, + "learning_rate": 1e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.6933526396751404, + "num_tokens": 154010241.0, + "step": 6075 + }, + { + "epoch": 0.6672523610806061, + "grad_norm": 2.2504725456237793, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 
0.711664080619812, + "num_tokens": 154035022.0, + "step": 6076 + }, + { + "epoch": 0.6673621787832199, + "grad_norm": 2.094709634780884, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7165436148643494, + "num_tokens": 154061639.0, + "step": 6077 + }, + { + "epoch": 0.6674719964858336, + "grad_norm": 2.3879594802856445, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.695194661617279, + "num_tokens": 154084441.0, + "step": 6078 + }, + { + "epoch": 0.6675818141884472, + "grad_norm": 2.2978107929229736, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.704224705696106, + "num_tokens": 154107181.0, + "step": 6079 + }, + { + "epoch": 0.6676916318910608, + "grad_norm": 2.143209934234619, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.6903700828552246, + "num_tokens": 154133664.0, + "step": 6080 + }, + { + "epoch": 0.6678014495936745, + "grad_norm": 2.262852191925049, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.7111080288887024, + "num_tokens": 154157665.0, + "step": 6081 + }, + { + "epoch": 0.6679112672962881, + "grad_norm": 2.290480613708496, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7008291482925415, + "num_tokens": 154182327.0, + "step": 6082 + }, + { + "epoch": 0.6680210849989018, + "grad_norm": 2.5005671977996826, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.6950415968894958, + "num_tokens": 154203764.0, + "step": 6083 + }, + { + "epoch": 0.6681309027015155, + "grad_norm": 2.274545192718506, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.714341402053833, + "num_tokens": 154229298.0, + "step": 6084 + }, + { + "epoch": 0.6682407204041292, + "grad_norm": 2.1303818225860596, + "learning_rate": 1e-06, + "loss": 1.0258, + "mean_token_accuracy": 0.6933460235595703, + "num_tokens": 154257377.0, + "step": 6085 + }, + { + "epoch": 0.6683505381067428, + "grad_norm": 2.279816150665283, 
+ "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.7164516448974609, + "num_tokens": 154281633.0, + "step": 6086 + }, + { + "epoch": 0.6684603558093565, + "grad_norm": 1.904579758644104, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.7011341452598572, + "num_tokens": 154313463.0, + "step": 6087 + }, + { + "epoch": 0.6685701735119701, + "grad_norm": 2.0406556129455566, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.714432954788208, + "num_tokens": 154341119.0, + "step": 6088 + }, + { + "epoch": 0.6686799912145838, + "grad_norm": 2.383817672729492, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7137014865875244, + "num_tokens": 154362867.0, + "step": 6089 + }, + { + "epoch": 0.6687898089171974, + "grad_norm": 2.004056930541992, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.6992157697677612, + "num_tokens": 154393928.0, + "step": 6090 + }, + { + "epoch": 0.6688996266198111, + "grad_norm": 2.0112977027893066, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.6935364603996277, + "num_tokens": 154424790.0, + "step": 6091 + }, + { + "epoch": 0.6690094443224248, + "grad_norm": 2.1928865909576416, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7111824154853821, + "num_tokens": 154449905.0, + "step": 6092 + }, + { + "epoch": 0.6691192620250385, + "grad_norm": 2.3083486557006836, + "learning_rate": 1e-06, + "loss": 1.0276, + "mean_token_accuracy": 0.6905459761619568, + "num_tokens": 154473921.0, + "step": 6093 + }, + { + "epoch": 0.6692290797276521, + "grad_norm": 2.0236551761627197, + "learning_rate": 1e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.722701370716095, + "num_tokens": 154501449.0, + "step": 6094 + }, + { + "epoch": 0.6693388974302658, + "grad_norm": 2.405857801437378, + "learning_rate": 1e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.7367047667503357, + "num_tokens": 154521442.0, + "step": 6095 + 
}, + { + "epoch": 0.6694487151328794, + "grad_norm": 2.1120901107788086, + "learning_rate": 1e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.6903658509254456, + "num_tokens": 154551181.0, + "step": 6096 + }, + { + "epoch": 0.669558532835493, + "grad_norm": 2.0608255863189697, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7109904289245605, + "num_tokens": 154578778.0, + "step": 6097 + }, + { + "epoch": 0.6696683505381067, + "grad_norm": 2.004002332687378, + "learning_rate": 1e-06, + "loss": 1.0453, + "mean_token_accuracy": 0.6958333849906921, + "num_tokens": 154608617.0, + "step": 6098 + }, + { + "epoch": 0.6697781682407205, + "grad_norm": 1.9897726774215698, + "learning_rate": 1e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.689023494720459, + "num_tokens": 154638910.0, + "step": 6099 + }, + { + "epoch": 0.6698879859433341, + "grad_norm": 2.3084497451782227, + "learning_rate": 1e-06, + "loss": 1.0709, + "mean_token_accuracy": 0.6716408729553223, + "num_tokens": 154664574.0, + "step": 6100 + }, + { + "epoch": 0.6699978036459477, + "grad_norm": 2.0555317401885986, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.6970680356025696, + "num_tokens": 154691786.0, + "step": 6101 + }, + { + "epoch": 0.6701076213485614, + "grad_norm": 2.492278575897217, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7284008860588074, + "num_tokens": 154710879.0, + "step": 6102 + }, + { + "epoch": 0.670217439051175, + "grad_norm": 2.3605782985687256, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7182503938674927, + "num_tokens": 154732586.0, + "step": 6103 + }, + { + "epoch": 0.6703272567537887, + "grad_norm": 2.208756446838379, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7046754360198975, + "num_tokens": 154757714.0, + "step": 6104 + }, + { + "epoch": 0.6704370744564023, + "grad_norm": 2.5295019149780273, + "learning_rate": 1e-06, + "loss": 0.841, + 
"mean_token_accuracy": 0.7412089109420776, + "num_tokens": 154777619.0, + "step": 6105 + }, + { + "epoch": 0.6705468921590161, + "grad_norm": 2.4017295837402344, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7134260535240173, + "num_tokens": 154797779.0, + "step": 6106 + }, + { + "epoch": 0.6706567098616297, + "grad_norm": 2.4210753440856934, + "learning_rate": 1e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7205959558486938, + "num_tokens": 154819469.0, + "step": 6107 + }, + { + "epoch": 0.6707665275642434, + "grad_norm": 2.3388383388519287, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7040737867355347, + "num_tokens": 154842200.0, + "step": 6108 + }, + { + "epoch": 0.670876345266857, + "grad_norm": 2.2130281925201416, + "learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.6970620155334473, + "num_tokens": 154867097.0, + "step": 6109 + }, + { + "epoch": 0.6709861629694707, + "grad_norm": 2.4276235103607178, + "learning_rate": 1e-06, + "loss": 1.071, + "mean_token_accuracy": 0.6833465099334717, + "num_tokens": 154889282.0, + "step": 6110 + }, + { + "epoch": 0.6710959806720843, + "grad_norm": 2.076134443283081, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.6966794729232788, + "num_tokens": 154917734.0, + "step": 6111 + }, + { + "epoch": 0.671205798374698, + "grad_norm": 2.1681690216064453, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7014077305793762, + "num_tokens": 154945418.0, + "step": 6112 + }, + { + "epoch": 0.6713156160773117, + "grad_norm": 2.170090675354004, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.6933061480522156, + "num_tokens": 154971755.0, + "step": 6113 + }, + { + "epoch": 0.6714254337799254, + "grad_norm": 2.3749706745147705, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7193120718002319, + "num_tokens": 154992910.0, + "step": 6114 + }, + { + "epoch": 0.671535251482539, + 
"grad_norm": 2.405247449874878, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7233669757843018, + "num_tokens": 155014189.0, + "step": 6115 + }, + { + "epoch": 0.6716450691851527, + "grad_norm": 2.1313316822052, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7216998934745789, + "num_tokens": 155039145.0, + "step": 6116 + }, + { + "epoch": 0.6717548868877663, + "grad_norm": 2.4677064418792725, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7187520265579224, + "num_tokens": 155058574.0, + "step": 6117 + }, + { + "epoch": 0.67186470459038, + "grad_norm": 2.2153966426849365, + "learning_rate": 1e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.7069697976112366, + "num_tokens": 155084013.0, + "step": 6118 + }, + { + "epoch": 0.6719745222929936, + "grad_norm": 2.14631986618042, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.7055056095123291, + "num_tokens": 155108818.0, + "step": 6119 + }, + { + "epoch": 0.6720843399956072, + "grad_norm": 1.9250919818878174, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7115698456764221, + "num_tokens": 155140633.0, + "step": 6120 + }, + { + "epoch": 0.672194157698221, + "grad_norm": 2.1100406646728516, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7097413539886475, + "num_tokens": 155166594.0, + "step": 6121 + }, + { + "epoch": 0.6723039754008346, + "grad_norm": 2.2002062797546387, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.718215823173523, + "num_tokens": 155190167.0, + "step": 6122 + }, + { + "epoch": 0.6724137931034483, + "grad_norm": 2.3672642707824707, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.7045305371284485, + "num_tokens": 155213707.0, + "step": 6123 + }, + { + "epoch": 0.6725236108060619, + "grad_norm": 2.4079244136810303, + "learning_rate": 1e-06, + "loss": 0.8297, + "mean_token_accuracy": 0.740327775478363, + "num_tokens": 
155232742.0, + "step": 6124 + }, + { + "epoch": 0.6726334285086756, + "grad_norm": 2.0924453735351562, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.702580451965332, + "num_tokens": 155260127.0, + "step": 6125 + }, + { + "epoch": 0.6727432462112892, + "grad_norm": 2.5319528579711914, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7262556552886963, + "num_tokens": 155278733.0, + "step": 6126 + }, + { + "epoch": 0.6728530639139029, + "grad_norm": 2.1916160583496094, + "learning_rate": 1e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7326556444168091, + "num_tokens": 155302845.0, + "step": 6127 + }, + { + "epoch": 0.6729628816165166, + "grad_norm": 2.3758926391601562, + "learning_rate": 1e-06, + "loss": 0.8652, + "mean_token_accuracy": 0.7328174710273743, + "num_tokens": 155324315.0, + "step": 6128 + }, + { + "epoch": 0.6730726993191303, + "grad_norm": 2.055121898651123, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.706967830657959, + "num_tokens": 155352174.0, + "step": 6129 + }, + { + "epoch": 0.6731825170217439, + "grad_norm": 2.373785972595215, + "learning_rate": 1e-06, + "loss": 1.0118, + "mean_token_accuracy": 0.6957765817642212, + "num_tokens": 155377145.0, + "step": 6130 + }, + { + "epoch": 0.6732923347243576, + "grad_norm": 2.3051464557647705, + "learning_rate": 1e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.7438571453094482, + "num_tokens": 155397572.0, + "step": 6131 + }, + { + "epoch": 0.6734021524269712, + "grad_norm": 2.2847342491149902, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7058191895484924, + "num_tokens": 155421919.0, + "step": 6132 + }, + { + "epoch": 0.6735119701295849, + "grad_norm": 2.6214680671691895, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7192425727844238, + "num_tokens": 155440616.0, + "step": 6133 + }, + { + "epoch": 0.6736217878321985, + "grad_norm": 2.1331350803375244, + "learning_rate": 1e-06, + 
"loss": 1.0378, + "mean_token_accuracy": 0.6931447982788086, + "num_tokens": 155469455.0, + "step": 6134 + }, + { + "epoch": 0.6737316055348123, + "grad_norm": 2.0788795948028564, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7190643548965454, + "num_tokens": 155496870.0, + "step": 6135 + }, + { + "epoch": 0.6738414232374259, + "grad_norm": 2.391880989074707, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7184438705444336, + "num_tokens": 155518621.0, + "step": 6136 + }, + { + "epoch": 0.6739512409400396, + "grad_norm": 2.4866607189178467, + "learning_rate": 1e-06, + "loss": 1.0262, + "mean_token_accuracy": 0.6836262345314026, + "num_tokens": 155539724.0, + "step": 6137 + }, + { + "epoch": 0.6740610586426532, + "grad_norm": 2.4320313930511475, + "learning_rate": 1e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7256580591201782, + "num_tokens": 155560809.0, + "step": 6138 + }, + { + "epoch": 0.6741708763452668, + "grad_norm": 2.2727115154266357, + "learning_rate": 1e-06, + "loss": 1.0666, + "mean_token_accuracy": 0.6818839311599731, + "num_tokens": 155584850.0, + "step": 6139 + }, + { + "epoch": 0.6742806940478805, + "grad_norm": 2.2913248538970947, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.6943912506103516, + "num_tokens": 155609352.0, + "step": 6140 + }, + { + "epoch": 0.6743905117504941, + "grad_norm": 2.175218343734741, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7314267754554749, + "num_tokens": 155635230.0, + "step": 6141 + }, + { + "epoch": 0.6745003294531079, + "grad_norm": 2.13409423828125, + "learning_rate": 1e-06, + "loss": 1.0503, + "mean_token_accuracy": 0.6821754574775696, + "num_tokens": 155665054.0, + "step": 6142 + }, + { + "epoch": 0.6746101471557215, + "grad_norm": 2.1974685192108154, + "learning_rate": 1e-06, + "loss": 1.0334, + "mean_token_accuracy": 0.6853673458099365, + "num_tokens": 155690540.0, + "step": 6143 + }, + { + "epoch": 
0.6747199648583352, + "grad_norm": 2.4227805137634277, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7035444378852844, + "num_tokens": 155712919.0, + "step": 6144 + }, + { + "epoch": 0.6748297825609488, + "grad_norm": 2.4461863040924072, + "learning_rate": 1e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.7208433747291565, + "num_tokens": 155732459.0, + "step": 6145 + }, + { + "epoch": 0.6749396002635625, + "grad_norm": 2.3761093616485596, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7095943093299866, + "num_tokens": 155754973.0, + "step": 6146 + }, + { + "epoch": 0.6750494179661761, + "grad_norm": 2.007986545562744, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.7005046606063843, + "num_tokens": 155785730.0, + "step": 6147 + }, + { + "epoch": 0.6751592356687898, + "grad_norm": 2.3044865131378174, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7065572738647461, + "num_tokens": 155808222.0, + "step": 6148 + }, + { + "epoch": 0.6752690533714034, + "grad_norm": 2.012057304382324, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7135294079780579, + "num_tokens": 155838215.0, + "step": 6149 + }, + { + "epoch": 0.6753788710740172, + "grad_norm": 2.374276638031006, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7289470434188843, + "num_tokens": 155860554.0, + "step": 6150 + }, + { + "epoch": 0.6754886887766308, + "grad_norm": 2.2966904640197754, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.6956168413162231, + "num_tokens": 155884377.0, + "step": 6151 + }, + { + "epoch": 0.6755985064792445, + "grad_norm": 2.0129477977752686, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7147045731544495, + "num_tokens": 155915901.0, + "step": 6152 + }, + { + "epoch": 0.6757083241818581, + "grad_norm": 2.2697484493255615, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 
0.7048358917236328, + "num_tokens": 155939423.0, + "step": 6153 + }, + { + "epoch": 0.6758181418844718, + "grad_norm": 2.013794183731079, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.69586181640625, + "num_tokens": 155969255.0, + "step": 6154 + }, + { + "epoch": 0.6759279595870854, + "grad_norm": 2.5840578079223633, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7187868356704712, + "num_tokens": 155988662.0, + "step": 6155 + }, + { + "epoch": 0.676037777289699, + "grad_norm": 2.1065566539764404, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7060911655426025, + "num_tokens": 156015381.0, + "step": 6156 + }, + { + "epoch": 0.6761475949923128, + "grad_norm": 2.3648841381073, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7010147571563721, + "num_tokens": 156038538.0, + "step": 6157 + }, + { + "epoch": 0.6762574126949265, + "grad_norm": 1.9794386625289917, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.6889488101005554, + "num_tokens": 156071353.0, + "step": 6158 + }, + { + "epoch": 0.6763672303975401, + "grad_norm": 2.406467914581299, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7204846739768982, + "num_tokens": 156092141.0, + "step": 6159 + }, + { + "epoch": 0.6764770481001537, + "grad_norm": 2.40783429145813, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7162853479385376, + "num_tokens": 156113360.0, + "step": 6160 + }, + { + "epoch": 0.6765868658027674, + "grad_norm": 2.2010343074798584, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.701655387878418, + "num_tokens": 156138225.0, + "step": 6161 + }, + { + "epoch": 0.676696683505381, + "grad_norm": 1.991849660873413, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7104180455207825, + "num_tokens": 156168313.0, + "step": 6162 + }, + { + "epoch": 0.6768065012079947, + "grad_norm": 2.3556435108184814, + 
"learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7065759301185608, + "num_tokens": 156190914.0, + "step": 6163 + }, + { + "epoch": 0.6769163189106084, + "grad_norm": 2.37168288230896, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7169957160949707, + "num_tokens": 156214137.0, + "step": 6164 + }, + { + "epoch": 0.6770261366132221, + "grad_norm": 2.339423656463623, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.697048544883728, + "num_tokens": 156238386.0, + "step": 6165 + }, + { + "epoch": 0.6771359543158357, + "grad_norm": 2.296011209487915, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7035401463508606, + "num_tokens": 156262994.0, + "step": 6166 + }, + { + "epoch": 0.6772457720184494, + "grad_norm": 1.8848968744277954, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7176277041435242, + "num_tokens": 156295899.0, + "step": 6167 + }, + { + "epoch": 0.677355589721063, + "grad_norm": 2.0677201747894287, + "learning_rate": 1e-06, + "loss": 1.0732, + "mean_token_accuracy": 0.6803975105285645, + "num_tokens": 156326216.0, + "step": 6168 + }, + { + "epoch": 0.6774654074236767, + "grad_norm": 1.9358866214752197, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.6903108954429626, + "num_tokens": 156358574.0, + "step": 6169 + }, + { + "epoch": 0.6775752251262903, + "grad_norm": 2.2958126068115234, + "learning_rate": 1e-06, + "loss": 0.8521, + "mean_token_accuracy": 0.7265881299972534, + "num_tokens": 156381247.0, + "step": 6170 + }, + { + "epoch": 0.6776850428289041, + "grad_norm": 2.1450390815734863, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7177792191505432, + "num_tokens": 156405830.0, + "step": 6171 + }, + { + "epoch": 0.6777948605315177, + "grad_norm": 1.878587007522583, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.702635645866394, + "num_tokens": 156438979.0, + "step": 6172 + }, + 
{ + "epoch": 0.6779046782341314, + "grad_norm": 2.289578676223755, + "learning_rate": 1e-06, + "loss": 1.0484, + "mean_token_accuracy": 0.6851690411567688, + "num_tokens": 156463818.0, + "step": 6173 + }, + { + "epoch": 0.678014495936745, + "grad_norm": 2.013699531555176, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.7018672227859497, + "num_tokens": 156493709.0, + "step": 6174 + }, + { + "epoch": 0.6781243136393587, + "grad_norm": 1.9930979013442993, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.7106108069419861, + "num_tokens": 156524806.0, + "step": 6175 + }, + { + "epoch": 0.6782341313419723, + "grad_norm": 2.2055368423461914, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7251639366149902, + "num_tokens": 156548602.0, + "step": 6176 + }, + { + "epoch": 0.678343949044586, + "grad_norm": 1.915570855140686, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.7044914960861206, + "num_tokens": 156579928.0, + "step": 6177 + }, + { + "epoch": 0.6784537667471996, + "grad_norm": 2.158709764480591, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7131354212760925, + "num_tokens": 156604204.0, + "step": 6178 + }, + { + "epoch": 0.6785635844498134, + "grad_norm": 2.1967639923095703, + "learning_rate": 1e-06, + "loss": 1.051, + "mean_token_accuracy": 0.6837363839149475, + "num_tokens": 156631632.0, + "step": 6179 + }, + { + "epoch": 0.678673402152427, + "grad_norm": 2.5200088024139404, + "learning_rate": 1e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.7334586977958679, + "num_tokens": 156651195.0, + "step": 6180 + }, + { + "epoch": 0.6787832198550406, + "grad_norm": 1.9208799600601196, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7111326456069946, + "num_tokens": 156681927.0, + "step": 6181 + }, + { + "epoch": 0.6788930375576543, + "grad_norm": 2.256910800933838, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 
0.706960916519165, + "num_tokens": 156705342.0, + "step": 6182 + }, + { + "epoch": 0.6790028552602679, + "grad_norm": 2.1954033374786377, + "learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7071995735168457, + "num_tokens": 156731165.0, + "step": 6183 + }, + { + "epoch": 0.6791126729628816, + "grad_norm": 2.3903212547302246, + "learning_rate": 1e-06, + "loss": 0.8897, + "mean_token_accuracy": 0.726494550704956, + "num_tokens": 156752732.0, + "step": 6184 + }, + { + "epoch": 0.6792224906654952, + "grad_norm": 2.0322258472442627, + "learning_rate": 1e-06, + "loss": 1.0463, + "mean_token_accuracy": 0.6925579309463501, + "num_tokens": 156783283.0, + "step": 6185 + }, + { + "epoch": 0.679332308368109, + "grad_norm": 2.127552032470703, + "learning_rate": 1e-06, + "loss": 1.0384, + "mean_token_accuracy": 0.6853510141372681, + "num_tokens": 156812074.0, + "step": 6186 + }, + { + "epoch": 0.6794421260707226, + "grad_norm": 2.166257619857788, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7153536081314087, + "num_tokens": 156835533.0, + "step": 6187 + }, + { + "epoch": 0.6795519437733363, + "grad_norm": 2.22257399559021, + "learning_rate": 1e-06, + "loss": 0.7736, + "mean_token_accuracy": 0.7526231408119202, + "num_tokens": 156857399.0, + "step": 6188 + }, + { + "epoch": 0.6796617614759499, + "grad_norm": 2.137601852416992, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7166378498077393, + "num_tokens": 156884151.0, + "step": 6189 + }, + { + "epoch": 0.6797715791785636, + "grad_norm": 2.3329215049743652, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7197824716567993, + "num_tokens": 156907163.0, + "step": 6190 + }, + { + "epoch": 0.6798813968811772, + "grad_norm": 2.428950071334839, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.722887396812439, + "num_tokens": 156928545.0, + "step": 6191 + }, + { + "epoch": 0.6799912145837909, + "grad_norm": 
2.3439531326293945, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7091655135154724, + "num_tokens": 156950713.0, + "step": 6192 + }, + { + "epoch": 0.6801010322864046, + "grad_norm": 2.2612738609313965, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7170946598052979, + "num_tokens": 156973840.0, + "step": 6193 + }, + { + "epoch": 0.6802108499890183, + "grad_norm": 2.1783556938171387, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.6944631338119507, + "num_tokens": 156999836.0, + "step": 6194 + }, + { + "epoch": 0.6803206676916319, + "grad_norm": 2.08735728263855, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.697401762008667, + "num_tokens": 157027914.0, + "step": 6195 + }, + { + "epoch": 0.6804304853942456, + "grad_norm": 2.0931036472320557, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.6977595090866089, + "num_tokens": 157056979.0, + "step": 6196 + }, + { + "epoch": 0.6805403030968592, + "grad_norm": 2.1674644947052, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.697377622127533, + "num_tokens": 157082172.0, + "step": 6197 + }, + { + "epoch": 0.6806501207994728, + "grad_norm": 2.1225969791412354, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7087332010269165, + "num_tokens": 157108384.0, + "step": 6198 + }, + { + "epoch": 0.6807599385020865, + "grad_norm": 2.6115355491638184, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.6981430053710938, + "num_tokens": 157130007.0, + "step": 6199 + }, + { + "epoch": 0.6808697562047002, + "grad_norm": 2.2678439617156982, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.6970165967941284, + "num_tokens": 157154757.0, + "step": 6200 + }, + { + "epoch": 0.6809795739073139, + "grad_norm": 1.9428516626358032, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.689304530620575, + "num_tokens": 
157185717.0, + "step": 6201 + }, + { + "epoch": 0.6810893916099275, + "grad_norm": 2.343296766281128, + "learning_rate": 1e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.7298728227615356, + "num_tokens": 157207824.0, + "step": 6202 + }, + { + "epoch": 0.6811992093125412, + "grad_norm": 2.133519411087036, + "learning_rate": 1e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.6909117698669434, + "num_tokens": 157236410.0, + "step": 6203 + }, + { + "epoch": 0.6813090270151548, + "grad_norm": 2.192877769470215, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.710824728012085, + "num_tokens": 157260885.0, + "step": 6204 + }, + { + "epoch": 0.6814188447177685, + "grad_norm": 2.181919813156128, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.704861044883728, + "num_tokens": 157285028.0, + "step": 6205 + }, + { + "epoch": 0.6815286624203821, + "grad_norm": 1.9829308986663818, + "learning_rate": 1e-06, + "loss": 1.059, + "mean_token_accuracy": 0.6828781366348267, + "num_tokens": 157314611.0, + "step": 6206 + }, + { + "epoch": 0.6816384801229959, + "grad_norm": 2.183450937271118, + "learning_rate": 1e-06, + "loss": 1.0391, + "mean_token_accuracy": 0.6845300793647766, + "num_tokens": 157340461.0, + "step": 6207 + }, + { + "epoch": 0.6817482978256095, + "grad_norm": 2.1528573036193848, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7115784883499146, + "num_tokens": 157367574.0, + "step": 6208 + }, + { + "epoch": 0.6818581155282232, + "grad_norm": 2.1842167377471924, + "learning_rate": 1e-06, + "loss": 0.8157, + "mean_token_accuracy": 0.7379909753799438, + "num_tokens": 157391924.0, + "step": 6209 + }, + { + "epoch": 0.6819679332308368, + "grad_norm": 2.105814218521118, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.6940677165985107, + "num_tokens": 157418477.0, + "step": 6210 + }, + { + "epoch": 0.6820777509334505, + "grad_norm": 2.176694393157959, + "learning_rate": 1e-06, + "loss": 
1.0462, + "mean_token_accuracy": 0.6832354068756104, + "num_tokens": 157444753.0, + "step": 6211 + }, + { + "epoch": 0.6821875686360641, + "grad_norm": 2.2611773014068604, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.6900920867919922, + "num_tokens": 157470281.0, + "step": 6212 + }, + { + "epoch": 0.6822973863386778, + "grad_norm": 2.3226144313812256, + "learning_rate": 1e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.6901710033416748, + "num_tokens": 157494028.0, + "step": 6213 + }, + { + "epoch": 0.6824072040412914, + "grad_norm": 2.321852207183838, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7069741487503052, + "num_tokens": 157517399.0, + "step": 6214 + }, + { + "epoch": 0.6825170217439052, + "grad_norm": 2.4177873134613037, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7312943339347839, + "num_tokens": 157537904.0, + "step": 6215 + }, + { + "epoch": 0.6826268394465188, + "grad_norm": 2.5244851112365723, + "learning_rate": 1e-06, + "loss": 0.8572, + "mean_token_accuracy": 0.7341760396957397, + "num_tokens": 157556377.0, + "step": 6216 + }, + { + "epoch": 0.6827366571491325, + "grad_norm": 2.56467604637146, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7127450108528137, + "num_tokens": 157577334.0, + "step": 6217 + }, + { + "epoch": 0.6828464748517461, + "grad_norm": 2.024097204208374, + "learning_rate": 1e-06, + "loss": 1.0704, + "mean_token_accuracy": 0.6802039742469788, + "num_tokens": 157608672.0, + "step": 6218 + }, + { + "epoch": 0.6829562925543597, + "grad_norm": 2.3836829662323, + "learning_rate": 1e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.7164677381515503, + "num_tokens": 157630532.0, + "step": 6219 + }, + { + "epoch": 0.6830661102569734, + "grad_norm": 2.3508875370025635, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.7037372589111328, + "num_tokens": 157653088.0, + "step": 6220 + }, + { + "epoch": 0.683175927959587, 
+ "grad_norm": 1.919156789779663, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.6958478689193726, + "num_tokens": 157684627.0, + "step": 6221 + }, + { + "epoch": 0.6832857456622008, + "grad_norm": 2.144627332687378, + "learning_rate": 1e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.7176413536071777, + "num_tokens": 157712326.0, + "step": 6222 + }, + { + "epoch": 0.6833955633648144, + "grad_norm": 2.3800816535949707, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7174738645553589, + "num_tokens": 157733477.0, + "step": 6223 + }, + { + "epoch": 0.6835053810674281, + "grad_norm": 2.354971170425415, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7222625613212585, + "num_tokens": 157757265.0, + "step": 6224 + }, + { + "epoch": 0.6836151987700417, + "grad_norm": 2.353916645050049, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7123434543609619, + "num_tokens": 157778889.0, + "step": 6225 + }, + { + "epoch": 0.6837250164726554, + "grad_norm": 2.2645211219787598, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.701941728591919, + "num_tokens": 157802738.0, + "step": 6226 + }, + { + "epoch": 0.683834834175269, + "grad_norm": 2.143653631210327, + "learning_rate": 1e-06, + "loss": 1.0337, + "mean_token_accuracy": 0.6846807599067688, + "num_tokens": 157831338.0, + "step": 6227 + }, + { + "epoch": 0.6839446518778827, + "grad_norm": 2.218899965286255, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7198106050491333, + "num_tokens": 157854455.0, + "step": 6228 + }, + { + "epoch": 0.6840544695804964, + "grad_norm": 2.106781482696533, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7094824314117432, + "num_tokens": 157881290.0, + "step": 6229 + }, + { + "epoch": 0.6841642872831101, + "grad_norm": 1.9874975681304932, + "learning_rate": 1e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7185678482055664, + 
"num_tokens": 157909791.0, + "step": 6230 + }, + { + "epoch": 0.6842741049857237, + "grad_norm": 2.20477294921875, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7232551574707031, + "num_tokens": 157936312.0, + "step": 6231 + }, + { + "epoch": 0.6843839226883374, + "grad_norm": 1.960361361503601, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.6908582448959351, + "num_tokens": 157968869.0, + "step": 6232 + }, + { + "epoch": 0.684493740390951, + "grad_norm": 2.5505168437957764, + "learning_rate": 1e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.7294111847877502, + "num_tokens": 157988198.0, + "step": 6233 + }, + { + "epoch": 0.6846035580935647, + "grad_norm": 2.2382607460021973, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7177004814147949, + "num_tokens": 158011454.0, + "step": 6234 + }, + { + "epoch": 0.6847133757961783, + "grad_norm": 2.387383460998535, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7019346356391907, + "num_tokens": 158033929.0, + "step": 6235 + }, + { + "epoch": 0.6848231934987921, + "grad_norm": 2.190389633178711, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7204231023788452, + "num_tokens": 158058011.0, + "step": 6236 + }, + { + "epoch": 0.6849330112014057, + "grad_norm": 2.290959596633911, + "learning_rate": 1e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7273998260498047, + "num_tokens": 158080234.0, + "step": 6237 + }, + { + "epoch": 0.6850428289040194, + "grad_norm": 2.1936709880828857, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7282887697219849, + "num_tokens": 158104472.0, + "step": 6238 + }, + { + "epoch": 0.685152646606633, + "grad_norm": 2.4298274517059326, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7094215154647827, + "num_tokens": 158126524.0, + "step": 6239 + }, + { + "epoch": 0.6852624643092466, + "grad_norm": 2.2532660961151123, + "learning_rate": 
1e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.709704577922821, + "num_tokens": 158151229.0, + "step": 6240 + }, + { + "epoch": 0.6853722820118603, + "grad_norm": 2.0322515964508057, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7118415832519531, + "num_tokens": 158178829.0, + "step": 6241 + }, + { + "epoch": 0.6854820997144739, + "grad_norm": 2.109579563140869, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7066262364387512, + "num_tokens": 158204573.0, + "step": 6242 + }, + { + "epoch": 0.6855919174170876, + "grad_norm": 1.943522334098816, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.6926445960998535, + "num_tokens": 158235970.0, + "step": 6243 + }, + { + "epoch": 0.6857017351197013, + "grad_norm": 2.206139087677002, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7277503609657288, + "num_tokens": 158259975.0, + "step": 6244 + }, + { + "epoch": 0.685811552822315, + "grad_norm": 2.2654898166656494, + "learning_rate": 1e-06, + "loss": 1.0409, + "mean_token_accuracy": 0.682620108127594, + "num_tokens": 158284075.0, + "step": 6245 + }, + { + "epoch": 0.6859213705249286, + "grad_norm": 2.116032361984253, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7086336612701416, + "num_tokens": 158314114.0, + "step": 6246 + }, + { + "epoch": 0.6860311882275423, + "grad_norm": 2.382559061050415, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7212778925895691, + "num_tokens": 158335794.0, + "step": 6247 + }, + { + "epoch": 0.6861410059301559, + "grad_norm": 2.1193275451660156, + "learning_rate": 1e-06, + "loss": 0.8897, + "mean_token_accuracy": 0.7317808866500854, + "num_tokens": 158360169.0, + "step": 6248 + }, + { + "epoch": 0.6862508236327696, + "grad_norm": 2.035296678543091, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7038679122924805, + "num_tokens": 158387491.0, + "step": 6249 + }, + { + "epoch": 
0.6863606413353832, + "grad_norm": 2.1543614864349365, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7119647860527039, + "num_tokens": 158411871.0, + "step": 6250 + }, + { + "epoch": 0.686470459037997, + "grad_norm": 1.9547439813613892, + "learning_rate": 1e-06, + "loss": 1.059, + "mean_token_accuracy": 0.679185152053833, + "num_tokens": 158445946.0, + "step": 6251 + }, + { + "epoch": 0.6865802767406106, + "grad_norm": 2.0374677181243896, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.6957399845123291, + "num_tokens": 158475251.0, + "step": 6252 + }, + { + "epoch": 0.6866900944432243, + "grad_norm": 2.2253501415252686, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7022788524627686, + "num_tokens": 158499619.0, + "step": 6253 + }, + { + "epoch": 0.6867999121458379, + "grad_norm": 2.223237991333008, + "learning_rate": 1e-06, + "loss": 1.0549, + "mean_token_accuracy": 0.6781241297721863, + "num_tokens": 158526578.0, + "step": 6254 + }, + { + "epoch": 0.6869097298484516, + "grad_norm": 2.2475812435150146, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.6976771950721741, + "num_tokens": 158553436.0, + "step": 6255 + }, + { + "epoch": 0.6870195475510652, + "grad_norm": 2.2981791496276855, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7219123840332031, + "num_tokens": 158575894.0, + "step": 6256 + }, + { + "epoch": 0.6871293652536788, + "grad_norm": 2.1278486251831055, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.6927163600921631, + "num_tokens": 158604242.0, + "step": 6257 + }, + { + "epoch": 0.6872391829562926, + "grad_norm": 1.9904422760009766, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.6858114004135132, + "num_tokens": 158634127.0, + "step": 6258 + }, + { + "epoch": 0.6873490006589063, + "grad_norm": 1.9612046480178833, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 
0.6955226063728333, + "num_tokens": 158664852.0, + "step": 6259 + }, + { + "epoch": 0.6874588183615199, + "grad_norm": 2.0982229709625244, + "learning_rate": 1e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.7321090698242188, + "num_tokens": 158690819.0, + "step": 6260 + }, + { + "epoch": 0.6875686360641335, + "grad_norm": 2.0965967178344727, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.6917741298675537, + "num_tokens": 158719304.0, + "step": 6261 + }, + { + "epoch": 0.6876784537667472, + "grad_norm": 2.3490889072418213, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.712417483329773, + "num_tokens": 158742001.0, + "step": 6262 + }, + { + "epoch": 0.6877882714693608, + "grad_norm": 2.122030258178711, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7150592803955078, + "num_tokens": 158768127.0, + "step": 6263 + }, + { + "epoch": 0.6878980891719745, + "grad_norm": 2.867938995361328, + "learning_rate": 1e-06, + "loss": 0.8278, + "mean_token_accuracy": 0.7389485836029053, + "num_tokens": 158784741.0, + "step": 6264 + }, + { + "epoch": 0.6880079068745882, + "grad_norm": 2.4228265285491943, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7090144157409668, + "num_tokens": 158805551.0, + "step": 6265 + }, + { + "epoch": 0.6881177245772019, + "grad_norm": 1.9690579175949097, + "learning_rate": 1e-06, + "loss": 1.0459, + "mean_token_accuracy": 0.6938087344169617, + "num_tokens": 158837092.0, + "step": 6266 + }, + { + "epoch": 0.6882275422798155, + "grad_norm": 2.163980007171631, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7244011163711548, + "num_tokens": 158861827.0, + "step": 6267 + }, + { + "epoch": 0.6883373599824292, + "grad_norm": 2.2308645248413086, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7083244323730469, + "num_tokens": 158887550.0, + "step": 6268 + }, + { + "epoch": 0.6884471776850428, + "grad_norm": 
2.2098610401153564, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.7022541165351868, + "num_tokens": 158912514.0, + "step": 6269 + }, + { + "epoch": 0.6885569953876565, + "grad_norm": 2.2752678394317627, + "learning_rate": 1e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.6879830956459045, + "num_tokens": 158936872.0, + "step": 6270 + }, + { + "epoch": 0.6886668130902701, + "grad_norm": 2.344243049621582, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7166727781295776, + "num_tokens": 158959192.0, + "step": 6271 + }, + { + "epoch": 0.6887766307928838, + "grad_norm": 2.2872979640960693, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7227323651313782, + "num_tokens": 158981931.0, + "step": 6272 + }, + { + "epoch": 0.6888864484954975, + "grad_norm": 1.908769130706787, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.6937434077262878, + "num_tokens": 159013191.0, + "step": 6273 + }, + { + "epoch": 0.6889962661981112, + "grad_norm": 2.2243809700012207, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.7013390064239502, + "num_tokens": 159037677.0, + "step": 6274 + }, + { + "epoch": 0.6891060839007248, + "grad_norm": 2.3216283321380615, + "learning_rate": 1e-06, + "loss": 0.869, + "mean_token_accuracy": 0.7297857999801636, + "num_tokens": 159061629.0, + "step": 6275 + }, + { + "epoch": 0.6892159016033385, + "grad_norm": 1.938098430633545, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7190076112747192, + "num_tokens": 159090960.0, + "step": 6276 + }, + { + "epoch": 0.6893257193059521, + "grad_norm": 2.07736873626709, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.6932022571563721, + "num_tokens": 159119364.0, + "step": 6277 + }, + { + "epoch": 0.6894355370085657, + "grad_norm": 2.052788496017456, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7054445147514343, + "num_tokens": 
159148610.0, + "step": 6278 + }, + { + "epoch": 0.6895453547111794, + "grad_norm": 2.0340402126312256, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.7002687454223633, + "num_tokens": 159179339.0, + "step": 6279 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 2.317467451095581, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7153773903846741, + "num_tokens": 159201391.0, + "step": 6280 + }, + { + "epoch": 0.6897649901164068, + "grad_norm": 2.42192006111145, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7074386477470398, + "num_tokens": 159223745.0, + "step": 6281 + }, + { + "epoch": 0.6898748078190204, + "grad_norm": 2.113922595977783, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.705973744392395, + "num_tokens": 159248918.0, + "step": 6282 + }, + { + "epoch": 0.6899846255216341, + "grad_norm": 1.7805505990982056, + "learning_rate": 1e-06, + "loss": 1.0566, + "mean_token_accuracy": 0.6830464601516724, + "num_tokens": 159288158.0, + "step": 6283 + }, + { + "epoch": 0.6900944432242477, + "grad_norm": 2.4796078205108643, + "learning_rate": 1e-06, + "loss": 0.849, + "mean_token_accuracy": 0.7294270992279053, + "num_tokens": 159309103.0, + "step": 6284 + }, + { + "epoch": 0.6902042609268614, + "grad_norm": 2.487553358078003, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7252597808837891, + "num_tokens": 159329391.0, + "step": 6285 + }, + { + "epoch": 0.690314078629475, + "grad_norm": 2.212341070175171, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7099913358688354, + "num_tokens": 159352827.0, + "step": 6286 + }, + { + "epoch": 0.6904238963320888, + "grad_norm": 2.0995025634765625, + "learning_rate": 1e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.724469780921936, + "num_tokens": 159380510.0, + "step": 6287 + }, + { + "epoch": 0.6905337140347024, + "grad_norm": 2.2819764614105225, + "learning_rate": 1e-06, + 
"loss": 0.9351, + "mean_token_accuracy": 0.7100838422775269, + "num_tokens": 159402386.0, + "step": 6288 + }, + { + "epoch": 0.6906435317373161, + "grad_norm": 2.328341007232666, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7122107744216919, + "num_tokens": 159426986.0, + "step": 6289 + }, + { + "epoch": 0.6907533494399297, + "grad_norm": 1.8376479148864746, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7126060724258423, + "num_tokens": 159459312.0, + "step": 6290 + }, + { + "epoch": 0.6908631671425434, + "grad_norm": 2.139249324798584, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7057386636734009, + "num_tokens": 159487913.0, + "step": 6291 + }, + { + "epoch": 0.690972984845157, + "grad_norm": 1.994547724723816, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.686690092086792, + "num_tokens": 159519405.0, + "step": 6292 + }, + { + "epoch": 0.6910828025477707, + "grad_norm": 2.26883864402771, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.728904664516449, + "num_tokens": 159542675.0, + "step": 6293 + }, + { + "epoch": 0.6911926202503844, + "grad_norm": 2.2768373489379883, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7168816328048706, + "num_tokens": 159567776.0, + "step": 6294 + }, + { + "epoch": 0.6913024379529981, + "grad_norm": 2.3677666187286377, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7203156352043152, + "num_tokens": 159588675.0, + "step": 6295 + }, + { + "epoch": 0.6914122556556117, + "grad_norm": 2.2614176273345947, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.708983302116394, + "num_tokens": 159614530.0, + "step": 6296 + }, + { + "epoch": 0.6915220733582254, + "grad_norm": 2.332850217819214, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7055438756942749, + "num_tokens": 159637399.0, + "step": 6297 + }, + { + "epoch": 
0.691631891060839, + "grad_norm": 2.253124475479126, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.7132558822631836, + "num_tokens": 159662079.0, + "step": 6298 + }, + { + "epoch": 0.6917417087634526, + "grad_norm": 2.088639497756958, + "learning_rate": 1e-06, + "loss": 0.8263, + "mean_token_accuracy": 0.7386224269866943, + "num_tokens": 159688660.0, + "step": 6299 + }, + { + "epoch": 0.6918515264660663, + "grad_norm": 2.2115299701690674, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.6977335214614868, + "num_tokens": 159714490.0, + "step": 6300 + }, + { + "epoch": 0.6919613441686799, + "grad_norm": 1.7331513166427612, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7149010300636292, + "num_tokens": 159753266.0, + "step": 6301 + }, + { + "epoch": 0.6920711618712937, + "grad_norm": 2.546537399291992, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.73527991771698, + "num_tokens": 159773954.0, + "step": 6302 + }, + { + "epoch": 0.6921809795739073, + "grad_norm": 2.231661081314087, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.718055009841919, + "num_tokens": 159798565.0, + "step": 6303 + }, + { + "epoch": 0.692290797276521, + "grad_norm": 1.8478456735610962, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.6986678838729858, + "num_tokens": 159831126.0, + "step": 6304 + }, + { + "epoch": 0.6924006149791346, + "grad_norm": 1.9644501209259033, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7230265736579895, + "num_tokens": 159859614.0, + "step": 6305 + }, + { + "epoch": 0.6925104326817483, + "grad_norm": 2.1937921047210693, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7206945419311523, + "num_tokens": 159885400.0, + "step": 6306 + }, + { + "epoch": 0.6926202503843619, + "grad_norm": 2.2271547317504883, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 
0.7164360880851746, + "num_tokens": 159911721.0, + "step": 6307 + }, + { + "epoch": 0.6927300680869756, + "grad_norm": 2.403268337249756, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7111694812774658, + "num_tokens": 159934193.0, + "step": 6308 + }, + { + "epoch": 0.6928398857895893, + "grad_norm": 2.523801326751709, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7057809829711914, + "num_tokens": 159959785.0, + "step": 6309 + }, + { + "epoch": 0.692949703492203, + "grad_norm": 2.4416863918304443, + "learning_rate": 1e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.682730495929718, + "num_tokens": 159983654.0, + "step": 6310 + }, + { + "epoch": 0.6930595211948166, + "grad_norm": 2.5187060832977295, + "learning_rate": 1e-06, + "loss": 0.8897, + "mean_token_accuracy": 0.7245076894760132, + "num_tokens": 160002512.0, + "step": 6311 + }, + { + "epoch": 0.6931693388974303, + "grad_norm": 2.291456937789917, + "learning_rate": 1e-06, + "loss": 1.0963, + "mean_token_accuracy": 0.6817235946655273, + "num_tokens": 160030415.0, + "step": 6312 + }, + { + "epoch": 0.6932791566000439, + "grad_norm": 2.2405853271484375, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7071855664253235, + "num_tokens": 160054125.0, + "step": 6313 + }, + { + "epoch": 0.6933889743026576, + "grad_norm": 2.2932019233703613, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.7064781188964844, + "num_tokens": 160078870.0, + "step": 6314 + }, + { + "epoch": 0.6934987920052712, + "grad_norm": 2.189791679382324, + "learning_rate": 1e-06, + "loss": 0.987, + "mean_token_accuracy": 0.6962211728096008, + "num_tokens": 160103965.0, + "step": 6315 + }, + { + "epoch": 0.693608609707885, + "grad_norm": 2.1856675148010254, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.699579656124115, + "num_tokens": 160130318.0, + "step": 6316 + }, + { + "epoch": 0.6937184274104986, + "grad_norm": 
2.224677801132202, + "learning_rate": 1e-06, + "loss": 1.0323, + "mean_token_accuracy": 0.6873725056648254, + "num_tokens": 160154622.0, + "step": 6317 + }, + { + "epoch": 0.6938282451131123, + "grad_norm": 2.123807430267334, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.7009844183921814, + "num_tokens": 160181021.0, + "step": 6318 + }, + { + "epoch": 0.6939380628157259, + "grad_norm": 2.052048444747925, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6907047033309937, + "num_tokens": 160211589.0, + "step": 6319 + }, + { + "epoch": 0.6940478805183395, + "grad_norm": 2.324385643005371, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.703161358833313, + "num_tokens": 160235489.0, + "step": 6320 + }, + { + "epoch": 0.6941576982209532, + "grad_norm": 2.247089147567749, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7082785367965698, + "num_tokens": 160260185.0, + "step": 6321 + }, + { + "epoch": 0.6942675159235668, + "grad_norm": 2.2140755653381348, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7052781581878662, + "num_tokens": 160285782.0, + "step": 6322 + }, + { + "epoch": 0.6943773336261806, + "grad_norm": 2.1731951236724854, + "learning_rate": 1e-06, + "loss": 0.8366, + "mean_token_accuracy": 0.7321088910102844, + "num_tokens": 160308972.0, + "step": 6323 + }, + { + "epoch": 0.6944871513287942, + "grad_norm": 2.4046285152435303, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7156133651733398, + "num_tokens": 160331149.0, + "step": 6324 + }, + { + "epoch": 0.6945969690314079, + "grad_norm": 2.1258881092071533, + "learning_rate": 1e-06, + "loss": 0.843, + "mean_token_accuracy": 0.7403218746185303, + "num_tokens": 160357343.0, + "step": 6325 + }, + { + "epoch": 0.6947067867340215, + "grad_norm": 2.3410768508911133, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7102705836296082, + "num_tokens": 
160381147.0, + "step": 6326 + }, + { + "epoch": 0.6948166044366352, + "grad_norm": 2.180464506149292, + "learning_rate": 1e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.7015251517295837, + "num_tokens": 160407214.0, + "step": 6327 + }, + { + "epoch": 0.6949264221392488, + "grad_norm": 2.2944297790527344, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.712655782699585, + "num_tokens": 160431526.0, + "step": 6328 + }, + { + "epoch": 0.6950362398418625, + "grad_norm": 2.1307921409606934, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.6946518421173096, + "num_tokens": 160458549.0, + "step": 6329 + }, + { + "epoch": 0.6951460575444761, + "grad_norm": 2.0189719200134277, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.6949666738510132, + "num_tokens": 160486611.0, + "step": 6330 + }, + { + "epoch": 0.6952558752470899, + "grad_norm": 2.0442023277282715, + "learning_rate": 1e-06, + "loss": 1.1322, + "mean_token_accuracy": 0.6659866571426392, + "num_tokens": 160517981.0, + "step": 6331 + }, + { + "epoch": 0.6953656929497035, + "grad_norm": 2.2071778774261475, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7056006193161011, + "num_tokens": 160543221.0, + "step": 6332 + }, + { + "epoch": 0.6954755106523172, + "grad_norm": 2.081331253051758, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6874446272850037, + "num_tokens": 160572056.0, + "step": 6333 + }, + { + "epoch": 0.6955853283549308, + "grad_norm": 2.119140863418579, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7142292261123657, + "num_tokens": 160599681.0, + "step": 6334 + }, + { + "epoch": 0.6956951460575445, + "grad_norm": 2.2154972553253174, + "learning_rate": 1e-06, + "loss": 1.0376, + "mean_token_accuracy": 0.6903655529022217, + "num_tokens": 160626627.0, + "step": 6335 + }, + { + "epoch": 0.6958049637601581, + "grad_norm": 2.2333431243896484, + "learning_rate": 1e-06, + 
"loss": 0.9057, + "mean_token_accuracy": 0.7188851833343506, + "num_tokens": 160649519.0, + "step": 6336 + }, + { + "epoch": 0.6959147814627717, + "grad_norm": 2.1645052433013916, + "learning_rate": 1e-06, + "loss": 1.0628, + "mean_token_accuracy": 0.6854922771453857, + "num_tokens": 160677504.0, + "step": 6337 + }, + { + "epoch": 0.6960245991653855, + "grad_norm": 2.358879804611206, + "learning_rate": 1e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7259065508842468, + "num_tokens": 160699631.0, + "step": 6338 + }, + { + "epoch": 0.6961344168679992, + "grad_norm": 2.264895439147949, + "learning_rate": 1e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.7291773557662964, + "num_tokens": 160720878.0, + "step": 6339 + }, + { + "epoch": 0.6962442345706128, + "grad_norm": 2.1260533332824707, + "learning_rate": 1e-06, + "loss": 1.0716, + "mean_token_accuracy": 0.676463782787323, + "num_tokens": 160750062.0, + "step": 6340 + }, + { + "epoch": 0.6963540522732264, + "grad_norm": 1.9867531061172485, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.720391035079956, + "num_tokens": 160779357.0, + "step": 6341 + }, + { + "epoch": 0.6964638699758401, + "grad_norm": 2.215054988861084, + "learning_rate": 1e-06, + "loss": 1.02, + "mean_token_accuracy": 0.6938520073890686, + "num_tokens": 160803719.0, + "step": 6342 + }, + { + "epoch": 0.6965736876784537, + "grad_norm": 2.203845262527466, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.712154746055603, + "num_tokens": 160828258.0, + "step": 6343 + }, + { + "epoch": 0.6966835053810674, + "grad_norm": 2.587613582611084, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7256700992584229, + "num_tokens": 160847097.0, + "step": 6344 + }, + { + "epoch": 0.6967933230836811, + "grad_norm": 2.1202008724212646, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.697732150554657, + "num_tokens": 160873820.0, + "step": 6345 + }, + { + "epoch": 
0.6969031407862948, + "grad_norm": 2.5469682216644287, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7161226272583008, + "num_tokens": 160893192.0, + "step": 6346 + }, + { + "epoch": 0.6970129584889084, + "grad_norm": 2.2845964431762695, + "learning_rate": 1e-06, + "loss": 1.0374, + "mean_token_accuracy": 0.6821848154067993, + "num_tokens": 160918261.0, + "step": 6347 + }, + { + "epoch": 0.6971227761915221, + "grad_norm": 2.2771801948547363, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.7056275606155396, + "num_tokens": 160943936.0, + "step": 6348 + }, + { + "epoch": 0.6972325938941357, + "grad_norm": 2.242387294769287, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7279256582260132, + "num_tokens": 160969226.0, + "step": 6349 + }, + { + "epoch": 0.6973424115967494, + "grad_norm": 2.390554428100586, + "learning_rate": 1e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.6975772380828857, + "num_tokens": 160991068.0, + "step": 6350 + }, + { + "epoch": 0.697452229299363, + "grad_norm": 2.211676597595215, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7105202674865723, + "num_tokens": 161015487.0, + "step": 6351 + }, + { + "epoch": 0.6975620470019768, + "grad_norm": 2.115215539932251, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.6918930411338806, + "num_tokens": 161043742.0, + "step": 6352 + }, + { + "epoch": 0.6976718647045904, + "grad_norm": 2.2139813899993896, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7275102138519287, + "num_tokens": 161069164.0, + "step": 6353 + }, + { + "epoch": 0.6977816824072041, + "grad_norm": 2.468991279602051, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7090806365013123, + "num_tokens": 161089051.0, + "step": 6354 + }, + { + "epoch": 0.6978915001098177, + "grad_norm": 2.2333145141601562, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 
0.7005417346954346, + "num_tokens": 161116182.0, + "step": 6355 + }, + { + "epoch": 0.6980013178124314, + "grad_norm": 2.32772159576416, + "learning_rate": 1e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7181970477104187, + "num_tokens": 161138957.0, + "step": 6356 + }, + { + "epoch": 0.698111135515045, + "grad_norm": 2.2368531227111816, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7173300981521606, + "num_tokens": 161163349.0, + "step": 6357 + }, + { + "epoch": 0.6982209532176586, + "grad_norm": 2.1552441120147705, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.703320324420929, + "num_tokens": 161191350.0, + "step": 6358 + }, + { + "epoch": 0.6983307709202724, + "grad_norm": 2.276740550994873, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.6940494179725647, + "num_tokens": 161216465.0, + "step": 6359 + }, + { + "epoch": 0.698440588622886, + "grad_norm": 2.2484076023101807, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7043802738189697, + "num_tokens": 161241356.0, + "step": 6360 + }, + { + "epoch": 0.6985504063254997, + "grad_norm": 2.4060757160186768, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7187796235084534, + "num_tokens": 161263896.0, + "step": 6361 + }, + { + "epoch": 0.6986602240281133, + "grad_norm": 2.123430013656616, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7129563093185425, + "num_tokens": 161292090.0, + "step": 6362 + }, + { + "epoch": 0.698770041730727, + "grad_norm": 2.2811293601989746, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7216496467590332, + "num_tokens": 161316574.0, + "step": 6363 + }, + { + "epoch": 0.6988798594333406, + "grad_norm": 2.143202304840088, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.6944724321365356, + "num_tokens": 161343965.0, + "step": 6364 + }, + { + "epoch": 0.6989896771359543, + "grad_norm": 2.128554105758667, 
+ "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.709530770778656, + "num_tokens": 161368453.0, + "step": 6365 + }, + { + "epoch": 0.6990994948385679, + "grad_norm": 2.4667866230010986, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7163282632827759, + "num_tokens": 161389652.0, + "step": 6366 + }, + { + "epoch": 0.6992093125411817, + "grad_norm": 2.3322582244873047, + "learning_rate": 1e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.6981323957443237, + "num_tokens": 161413162.0, + "step": 6367 + }, + { + "epoch": 0.6993191302437953, + "grad_norm": 2.192262649536133, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7239341735839844, + "num_tokens": 161436900.0, + "step": 6368 + }, + { + "epoch": 0.699428947946409, + "grad_norm": 2.1954598426818848, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7158313393592834, + "num_tokens": 161464002.0, + "step": 6369 + }, + { + "epoch": 0.6995387656490226, + "grad_norm": 2.1853694915771484, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7132605314254761, + "num_tokens": 161490488.0, + "step": 6370 + }, + { + "epoch": 0.6996485833516363, + "grad_norm": 1.9688584804534912, + "learning_rate": 1e-06, + "loss": 1.0402, + "mean_token_accuracy": 0.681427001953125, + "num_tokens": 161522505.0, + "step": 6371 + }, + { + "epoch": 0.6997584010542499, + "grad_norm": 2.392394542694092, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7203227281570435, + "num_tokens": 161544946.0, + "step": 6372 + }, + { + "epoch": 0.6998682187568636, + "grad_norm": 2.7950551509857178, + "learning_rate": 1e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.7342970371246338, + "num_tokens": 161562181.0, + "step": 6373 + }, + { + "epoch": 0.6999780364594773, + "grad_norm": 2.1577486991882324, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7119287252426147, + "num_tokens": 161586869.0, + "step": 6374 
+ }, + { + "epoch": 0.700087854162091, + "grad_norm": 2.076845407485962, + "learning_rate": 1e-06, + "loss": 1.044, + "mean_token_accuracy": 0.7007747888565063, + "num_tokens": 161612998.0, + "step": 6375 + }, + { + "epoch": 0.7001976718647046, + "grad_norm": 2.573530435562134, + "learning_rate": 1e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.7137582898139954, + "num_tokens": 161632764.0, + "step": 6376 + }, + { + "epoch": 0.7003074895673183, + "grad_norm": 2.723066806793213, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7114803194999695, + "num_tokens": 161650764.0, + "step": 6377 + }, + { + "epoch": 0.7004173072699319, + "grad_norm": 2.2480099201202393, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7132754325866699, + "num_tokens": 161674199.0, + "step": 6378 + }, + { + "epoch": 0.7005271249725455, + "grad_norm": 1.9857085943222046, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7067040801048279, + "num_tokens": 161707162.0, + "step": 6379 + }, + { + "epoch": 0.7006369426751592, + "grad_norm": 2.2003021240234375, + "learning_rate": 1e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.6894239783287048, + "num_tokens": 161734119.0, + "step": 6380 + }, + { + "epoch": 0.700746760377773, + "grad_norm": 2.097731590270996, + "learning_rate": 1e-06, + "loss": 0.8297, + "mean_token_accuracy": 0.7508107423782349, + "num_tokens": 161762775.0, + "step": 6381 + }, + { + "epoch": 0.7008565780803866, + "grad_norm": 2.401582956314087, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7095999717712402, + "num_tokens": 161784963.0, + "step": 6382 + }, + { + "epoch": 0.7009663957830002, + "grad_norm": 1.991689682006836, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.6937307119369507, + "num_tokens": 161814538.0, + "step": 6383 + }, + { + "epoch": 0.7010762134856139, + "grad_norm": 2.10959529876709, + "learning_rate": 1e-06, + "loss": 0.9998, + 
"mean_token_accuracy": 0.7020467519760132, + "num_tokens": 161842434.0, + "step": 6384 + }, + { + "epoch": 0.7011860311882275, + "grad_norm": 2.266685962677002, + "learning_rate": 1e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.6930860877037048, + "num_tokens": 161870336.0, + "step": 6385 + }, + { + "epoch": 0.7012958488908412, + "grad_norm": 2.062903881072998, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.701930046081543, + "num_tokens": 161897100.0, + "step": 6386 + }, + { + "epoch": 0.7014056665934548, + "grad_norm": 2.1867809295654297, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7129312753677368, + "num_tokens": 161921248.0, + "step": 6387 + }, + { + "epoch": 0.7015154842960686, + "grad_norm": 2.1320269107818604, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7054650187492371, + "num_tokens": 161946810.0, + "step": 6388 + }, + { + "epoch": 0.7016253019986822, + "grad_norm": 2.412217617034912, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7153760194778442, + "num_tokens": 161968728.0, + "step": 6389 + }, + { + "epoch": 0.7017351197012959, + "grad_norm": 2.3447375297546387, + "learning_rate": 1e-06, + "loss": 0.8306, + "mean_token_accuracy": 0.7383447885513306, + "num_tokens": 161991383.0, + "step": 6390 + }, + { + "epoch": 0.7018449374039095, + "grad_norm": 2.0120584964752197, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7237898707389832, + "num_tokens": 162020329.0, + "step": 6391 + }, + { + "epoch": 0.7019547551065232, + "grad_norm": 2.0068576335906982, + "learning_rate": 1e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.6936607956886292, + "num_tokens": 162051054.0, + "step": 6392 + }, + { + "epoch": 0.7020645728091368, + "grad_norm": 2.1555306911468506, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7079490423202515, + "num_tokens": 162077384.0, + "step": 6393 + }, + { + "epoch": 0.7021743905117505, + 
"grad_norm": 2.130558490753174, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.6996238231658936, + "num_tokens": 162105350.0, + "step": 6394 + }, + { + "epoch": 0.7022842082143641, + "grad_norm": 2.1129038333892822, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.7019438743591309, + "num_tokens": 162132087.0, + "step": 6395 + }, + { + "epoch": 0.7023940259169779, + "grad_norm": 2.1734118461608887, + "learning_rate": 1e-06, + "loss": 1.0282, + "mean_token_accuracy": 0.6954265832901001, + "num_tokens": 162158985.0, + "step": 6396 + }, + { + "epoch": 0.7025038436195915, + "grad_norm": 2.0742483139038086, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7087043523788452, + "num_tokens": 162185851.0, + "step": 6397 + }, + { + "epoch": 0.7026136613222052, + "grad_norm": 1.891394853591919, + "learning_rate": 1e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.698667049407959, + "num_tokens": 162219867.0, + "step": 6398 + }, + { + "epoch": 0.7027234790248188, + "grad_norm": 2.2778093814849854, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7083570957183838, + "num_tokens": 162243396.0, + "step": 6399 + }, + { + "epoch": 0.7028332967274324, + "grad_norm": 2.0098748207092285, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.715496838092804, + "num_tokens": 162271924.0, + "step": 6400 + }, + { + "epoch": 0.7029431144300461, + "grad_norm": 2.3390817642211914, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.6890175342559814, + "num_tokens": 162294996.0, + "step": 6401 + }, + { + "epoch": 0.7030529321326597, + "grad_norm": 2.066190481185913, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.7082571387290955, + "num_tokens": 162322496.0, + "step": 6402 + }, + { + "epoch": 0.7031627498352735, + "grad_norm": 2.718095064163208, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7241882085800171, + 
"num_tokens": 162339715.0, + "step": 6403 + }, + { + "epoch": 0.7032725675378871, + "grad_norm": 2.2798550128936768, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.6954823136329651, + "num_tokens": 162365432.0, + "step": 6404 + }, + { + "epoch": 0.7033823852405008, + "grad_norm": 2.1988327503204346, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7153152823448181, + "num_tokens": 162390256.0, + "step": 6405 + }, + { + "epoch": 0.7034922029431144, + "grad_norm": 2.4871268272399902, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7094818353652954, + "num_tokens": 162409265.0, + "step": 6406 + }, + { + "epoch": 0.7036020206457281, + "grad_norm": 2.239140033721924, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7036513090133667, + "num_tokens": 162433077.0, + "step": 6407 + }, + { + "epoch": 0.7037118383483417, + "grad_norm": 2.3298838138580322, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7152245044708252, + "num_tokens": 162455149.0, + "step": 6408 + }, + { + "epoch": 0.7038216560509554, + "grad_norm": 2.1599302291870117, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.6911055445671082, + "num_tokens": 162481995.0, + "step": 6409 + }, + { + "epoch": 0.7039314737535691, + "grad_norm": 2.1194944381713867, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7045225501060486, + "num_tokens": 162508307.0, + "step": 6410 + }, + { + "epoch": 0.7040412914561828, + "grad_norm": 2.4436612129211426, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7120716571807861, + "num_tokens": 162530326.0, + "step": 6411 + }, + { + "epoch": 0.7041511091587964, + "grad_norm": 2.1430656909942627, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7261815667152405, + "num_tokens": 162556280.0, + "step": 6412 + }, + { + "epoch": 0.7042609268614101, + "grad_norm": 2.2755157947540283, + 
"learning_rate": 1e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7345472574234009, + "num_tokens": 162578742.0, + "step": 6413 + }, + { + "epoch": 0.7043707445640237, + "grad_norm": 2.127643346786499, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7185581922531128, + "num_tokens": 162603971.0, + "step": 6414 + }, + { + "epoch": 0.7044805622666374, + "grad_norm": 2.119739294052124, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7198801040649414, + "num_tokens": 162630345.0, + "step": 6415 + }, + { + "epoch": 0.704590379969251, + "grad_norm": 2.233276844024658, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6980801224708557, + "num_tokens": 162654804.0, + "step": 6416 + }, + { + "epoch": 0.7047001976718648, + "grad_norm": 1.8234418630599976, + "learning_rate": 1e-06, + "loss": 1.1131, + "mean_token_accuracy": 0.6705334782600403, + "num_tokens": 162692385.0, + "step": 6417 + }, + { + "epoch": 0.7048100153744784, + "grad_norm": 2.360236406326294, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.704118549823761, + "num_tokens": 162714551.0, + "step": 6418 + }, + { + "epoch": 0.704919833077092, + "grad_norm": 2.3479597568511963, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7159146070480347, + "num_tokens": 162735539.0, + "step": 6419 + }, + { + "epoch": 0.7050296507797057, + "grad_norm": 2.506774425506592, + "learning_rate": 1e-06, + "loss": 1.0314, + "mean_token_accuracy": 0.6860877275466919, + "num_tokens": 162758830.0, + "step": 6420 + }, + { + "epoch": 0.7051394684823193, + "grad_norm": 2.2664852142333984, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7012754678726196, + "num_tokens": 162784225.0, + "step": 6421 + }, + { + "epoch": 0.705249286184933, + "grad_norm": 2.0446243286132812, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7189182043075562, + "num_tokens": 162813960.0, + "step": 6422 + }, 
+ { + "epoch": 0.7053591038875466, + "grad_norm": 2.189474105834961, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.713104248046875, + "num_tokens": 162841007.0, + "step": 6423 + }, + { + "epoch": 0.7054689215901603, + "grad_norm": 2.446018934249878, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7108927369117737, + "num_tokens": 162862646.0, + "step": 6424 + }, + { + "epoch": 0.705578739292774, + "grad_norm": 2.4889726638793945, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.6975609064102173, + "num_tokens": 162883387.0, + "step": 6425 + }, + { + "epoch": 0.7056885569953877, + "grad_norm": 2.3375749588012695, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7201524376869202, + "num_tokens": 162904556.0, + "step": 6426 + }, + { + "epoch": 0.7057983746980013, + "grad_norm": 2.0807957649230957, + "learning_rate": 1e-06, + "loss": 1.0629, + "mean_token_accuracy": 0.679167628288269, + "num_tokens": 162934121.0, + "step": 6427 + }, + { + "epoch": 0.705908192400615, + "grad_norm": 2.0920932292938232, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.7040524482727051, + "num_tokens": 162963457.0, + "step": 6428 + }, + { + "epoch": 0.7060180101032286, + "grad_norm": 2.2028019428253174, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7032101154327393, + "num_tokens": 162987113.0, + "step": 6429 + }, + { + "epoch": 0.7061278278058423, + "grad_norm": 2.2730536460876465, + "learning_rate": 1e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.73033607006073, + "num_tokens": 163010973.0, + "step": 6430 + }, + { + "epoch": 0.7062376455084559, + "grad_norm": 2.1096038818359375, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7290518283843994, + "num_tokens": 163035867.0, + "step": 6431 + }, + { + "epoch": 0.7063474632110697, + "grad_norm": 2.123464822769165, + "learning_rate": 1e-06, + "loss": 0.9406, + 
"mean_token_accuracy": 0.7064067125320435, + "num_tokens": 163064431.0, + "step": 6432 + }, + { + "epoch": 0.7064572809136833, + "grad_norm": 2.2136240005493164, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.6998753547668457, + "num_tokens": 163088953.0, + "step": 6433 + }, + { + "epoch": 0.706567098616297, + "grad_norm": 2.0883631706237793, + "learning_rate": 1e-06, + "loss": 1.046, + "mean_token_accuracy": 0.7042814493179321, + "num_tokens": 163116491.0, + "step": 6434 + }, + { + "epoch": 0.7066769163189106, + "grad_norm": 2.3045177459716797, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7049060463905334, + "num_tokens": 163139068.0, + "step": 6435 + }, + { + "epoch": 0.7067867340215243, + "grad_norm": 1.9898630380630493, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.6855778694152832, + "num_tokens": 163171784.0, + "step": 6436 + }, + { + "epoch": 0.7068965517241379, + "grad_norm": 2.0100622177124023, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.707011342048645, + "num_tokens": 163202970.0, + "step": 6437 + }, + { + "epoch": 0.7070063694267515, + "grad_norm": 2.3984713554382324, + "learning_rate": 1e-06, + "loss": 1.0539, + "mean_token_accuracy": 0.6811879277229309, + "num_tokens": 163228044.0, + "step": 6438 + }, + { + "epoch": 0.7071161871293653, + "grad_norm": 2.224881410598755, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.6904494762420654, + "num_tokens": 163254098.0, + "step": 6439 + }, + { + "epoch": 0.707226004831979, + "grad_norm": 1.9151180982589722, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7077265977859497, + "num_tokens": 163284807.0, + "step": 6440 + }, + { + "epoch": 0.7073358225345926, + "grad_norm": 2.2788946628570557, + "learning_rate": 1e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7351803779602051, + "num_tokens": 163306441.0, + "step": 6441 + }, + { + "epoch": 0.7074456402372062, + 
"grad_norm": 2.097428321838379, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7125641107559204, + "num_tokens": 163332448.0, + "step": 6442 + }, + { + "epoch": 0.7075554579398199, + "grad_norm": 2.1982126235961914, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.7022596597671509, + "num_tokens": 163358881.0, + "step": 6443 + }, + { + "epoch": 0.7076652756424335, + "grad_norm": 2.127103567123413, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.6975856423377991, + "num_tokens": 163385308.0, + "step": 6444 + }, + { + "epoch": 0.7077750933450472, + "grad_norm": 2.383404493331909, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6903241872787476, + "num_tokens": 163407359.0, + "step": 6445 + }, + { + "epoch": 0.7078849110476609, + "grad_norm": 2.382446050643921, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.6970481276512146, + "num_tokens": 163429549.0, + "step": 6446 + }, + { + "epoch": 0.7079947287502746, + "grad_norm": 1.9304362535476685, + "learning_rate": 1e-06, + "loss": 1.0641, + "mean_token_accuracy": 0.682607114315033, + "num_tokens": 163463387.0, + "step": 6447 + }, + { + "epoch": 0.7081045464528882, + "grad_norm": 1.9765963554382324, + "learning_rate": 1e-06, + "loss": 1.0534, + "mean_token_accuracy": 0.6841023564338684, + "num_tokens": 163496491.0, + "step": 6448 + }, + { + "epoch": 0.7082143641555019, + "grad_norm": 2.4184200763702393, + "learning_rate": 1e-06, + "loss": 0.7806, + "mean_token_accuracy": 0.7428082227706909, + "num_tokens": 163517716.0, + "step": 6449 + }, + { + "epoch": 0.7083241818581155, + "grad_norm": 2.1424577236175537, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.7082552313804626, + "num_tokens": 163542038.0, + "step": 6450 + }, + { + "epoch": 0.7084339995607292, + "grad_norm": 2.4285714626312256, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7156493067741394, + 
"num_tokens": 163562501.0, + "step": 6451 + }, + { + "epoch": 0.7085438172633428, + "grad_norm": 2.077754497528076, + "learning_rate": 1e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.7024094462394714, + "num_tokens": 163592428.0, + "step": 6452 + }, + { + "epoch": 0.7086536349659565, + "grad_norm": 2.3670647144317627, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7046120166778564, + "num_tokens": 163616656.0, + "step": 6453 + }, + { + "epoch": 0.7087634526685702, + "grad_norm": 2.648447275161743, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.708081841468811, + "num_tokens": 163636902.0, + "step": 6454 + }, + { + "epoch": 0.7088732703711839, + "grad_norm": 2.7552144527435303, + "learning_rate": 1e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.729539155960083, + "num_tokens": 163654620.0, + "step": 6455 + }, + { + "epoch": 0.7089830880737975, + "grad_norm": 2.208637237548828, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7022234201431274, + "num_tokens": 163678971.0, + "step": 6456 + }, + { + "epoch": 0.7090929057764112, + "grad_norm": 2.187345266342163, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7221149206161499, + "num_tokens": 163703951.0, + "step": 6457 + }, + { + "epoch": 0.7092027234790248, + "grad_norm": 2.084200143814087, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7128418684005737, + "num_tokens": 163731808.0, + "step": 6458 + }, + { + "epoch": 0.7093125411816384, + "grad_norm": 2.184278726577759, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.7000194191932678, + "num_tokens": 163757905.0, + "step": 6459 + }, + { + "epoch": 0.7094223588842521, + "grad_norm": 2.2493066787719727, + "learning_rate": 1e-06, + "loss": 1.062, + "mean_token_accuracy": 0.6829351782798767, + "num_tokens": 163783608.0, + "step": 6460 + }, + { + "epoch": 0.7095321765868658, + "grad_norm": 2.1382858753204346, + "learning_rate": 
1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7077344655990601, + "num_tokens": 163807872.0, + "step": 6461 + }, + { + "epoch": 0.7096419942894795, + "grad_norm": 2.039724826812744, + "learning_rate": 1e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.6985313892364502, + "num_tokens": 163835304.0, + "step": 6462 + }, + { + "epoch": 0.7097518119920931, + "grad_norm": 2.3916220664978027, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7038842439651489, + "num_tokens": 163858570.0, + "step": 6463 + }, + { + "epoch": 0.7098616296947068, + "grad_norm": 2.0484163761138916, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.695391058921814, + "num_tokens": 163888541.0, + "step": 6464 + }, + { + "epoch": 0.7099714473973204, + "grad_norm": 2.1826717853546143, + "learning_rate": 1e-06, + "loss": 1.057, + "mean_token_accuracy": 0.6725007891654968, + "num_tokens": 163913752.0, + "step": 6465 + }, + { + "epoch": 0.7100812650999341, + "grad_norm": 2.095123529434204, + "learning_rate": 1e-06, + "loss": 1.0455, + "mean_token_accuracy": 0.6892056465148926, + "num_tokens": 163941967.0, + "step": 6466 + }, + { + "epoch": 0.7101910828025477, + "grad_norm": 2.1807541847229004, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.6951241493225098, + "num_tokens": 163968947.0, + "step": 6467 + }, + { + "epoch": 0.7103009005051615, + "grad_norm": 2.059527635574341, + "learning_rate": 1e-06, + "loss": 1.0895, + "mean_token_accuracy": 0.6782947778701782, + "num_tokens": 163999789.0, + "step": 6468 + }, + { + "epoch": 0.7104107182077751, + "grad_norm": 2.016249179840088, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.6891034841537476, + "num_tokens": 164031606.0, + "step": 6469 + }, + { + "epoch": 0.7105205359103888, + "grad_norm": 1.9237608909606934, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.7002906799316406, + "num_tokens": 164064657.0, + "step": 6470 + }, + { + "epoch": 
0.7106303536130024, + "grad_norm": 2.2327120304107666, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.7020565271377563, + "num_tokens": 164089913.0, + "step": 6471 + }, + { + "epoch": 0.7107401713156161, + "grad_norm": 2.393251657485962, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7046985626220703, + "num_tokens": 164111622.0, + "step": 6472 + }, + { + "epoch": 0.7108499890182297, + "grad_norm": 2.005828857421875, + "learning_rate": 1e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.7302685976028442, + "num_tokens": 164138562.0, + "step": 6473 + }, + { + "epoch": 0.7109598067208434, + "grad_norm": 2.0855891704559326, + "learning_rate": 1e-06, + "loss": 1.0492, + "mean_token_accuracy": 0.686427116394043, + "num_tokens": 164167457.0, + "step": 6474 + }, + { + "epoch": 0.7110696244234571, + "grad_norm": 2.2478129863739014, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7086499929428101, + "num_tokens": 164190692.0, + "step": 6475 + }, + { + "epoch": 0.7111794421260708, + "grad_norm": 2.0428543090820312, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.712649941444397, + "num_tokens": 164218047.0, + "step": 6476 + }, + { + "epoch": 0.7112892598286844, + "grad_norm": 2.0588552951812744, + "learning_rate": 1e-06, + "loss": 1.0886, + "mean_token_accuracy": 0.682112455368042, + "num_tokens": 164248225.0, + "step": 6477 + }, + { + "epoch": 0.711399077531298, + "grad_norm": 2.30588698387146, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7001050114631653, + "num_tokens": 164273192.0, + "step": 6478 + }, + { + "epoch": 0.7115088952339117, + "grad_norm": 1.930947184562683, + "learning_rate": 1e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.6873540282249451, + "num_tokens": 164305460.0, + "step": 6479 + }, + { + "epoch": 0.7116187129365253, + "grad_norm": 2.240471363067627, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 
0.7100696563720703, + "num_tokens": 164331732.0, + "step": 6480 + }, + { + "epoch": 0.711728530639139, + "grad_norm": 1.9568077325820923, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.6945970058441162, + "num_tokens": 164362518.0, + "step": 6481 + }, + { + "epoch": 0.7118383483417526, + "grad_norm": 2.065408706665039, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7004572153091431, + "num_tokens": 164389847.0, + "step": 6482 + }, + { + "epoch": 0.7119481660443664, + "grad_norm": 2.2718703746795654, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7036250829696655, + "num_tokens": 164413307.0, + "step": 6483 + }, + { + "epoch": 0.71205798374698, + "grad_norm": 2.2621893882751465, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.6964670419692993, + "num_tokens": 164435932.0, + "step": 6484 + }, + { + "epoch": 0.7121678014495937, + "grad_norm": 2.143613576889038, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.6984196901321411, + "num_tokens": 164464440.0, + "step": 6485 + }, + { + "epoch": 0.7122776191522073, + "grad_norm": 2.083667278289795, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7012237310409546, + "num_tokens": 164490383.0, + "step": 6486 + }, + { + "epoch": 0.712387436854821, + "grad_norm": 2.3885602951049805, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7203219532966614, + "num_tokens": 164513205.0, + "step": 6487 + }, + { + "epoch": 0.7124972545574346, + "grad_norm": 2.375382661819458, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7160186767578125, + "num_tokens": 164535533.0, + "step": 6488 + }, + { + "epoch": 0.7126070722600483, + "grad_norm": 2.5764153003692627, + "learning_rate": 1e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7260178327560425, + "num_tokens": 164554841.0, + "step": 6489 + }, + { + "epoch": 0.712716889962662, + "grad_norm": 2.180466651916504, 
+ "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7086476683616638, + "num_tokens": 164578715.0, + "step": 6490 + }, + { + "epoch": 0.7128267076652757, + "grad_norm": 2.180450916290283, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7189119458198547, + "num_tokens": 164605598.0, + "step": 6491 + }, + { + "epoch": 0.7129365253678893, + "grad_norm": 2.453130006790161, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7183542847633362, + "num_tokens": 164625301.0, + "step": 6492 + }, + { + "epoch": 0.713046343070503, + "grad_norm": 2.0219311714172363, + "learning_rate": 1e-06, + "loss": 1.016, + "mean_token_accuracy": 0.7029901742935181, + "num_tokens": 164657205.0, + "step": 6493 + }, + { + "epoch": 0.7131561607731166, + "grad_norm": 2.1701595783233643, + "learning_rate": 1e-06, + "loss": 1.0895, + "mean_token_accuracy": 0.6742353439331055, + "num_tokens": 164684480.0, + "step": 6494 + }, + { + "epoch": 0.7132659784757303, + "grad_norm": 2.182413339614868, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.699662446975708, + "num_tokens": 164710794.0, + "step": 6495 + }, + { + "epoch": 0.7133757961783439, + "grad_norm": 2.0215249061584473, + "learning_rate": 1e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.6919904351234436, + "num_tokens": 164741140.0, + "step": 6496 + }, + { + "epoch": 0.7134856138809577, + "grad_norm": 2.218007802963257, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7259349822998047, + "num_tokens": 164766737.0, + "step": 6497 + }, + { + "epoch": 0.7135954315835713, + "grad_norm": 2.337414264678955, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7106444835662842, + "num_tokens": 164792747.0, + "step": 6498 + }, + { + "epoch": 0.713705249286185, + "grad_norm": 2.305102586746216, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.6998218894004822, + "num_tokens": 164817317.0, + "step": 6499 + }, 
+ { + "epoch": 0.7138150669887986, + "grad_norm": 1.9741151332855225, + "learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7144789695739746, + "num_tokens": 164845832.0, + "step": 6500 + }, + { + "epoch": 0.7139248846914122, + "grad_norm": 2.119483709335327, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.6986039876937866, + "num_tokens": 164873255.0, + "step": 6501 + }, + { + "epoch": 0.7140347023940259, + "grad_norm": 2.151338577270508, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.6972283720970154, + "num_tokens": 164900012.0, + "step": 6502 + }, + { + "epoch": 0.7141445200966395, + "grad_norm": 2.178138256072998, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7086809873580933, + "num_tokens": 164924363.0, + "step": 6503 + }, + { + "epoch": 0.7142543377992533, + "grad_norm": 2.3757259845733643, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7153856754302979, + "num_tokens": 164946578.0, + "step": 6504 + }, + { + "epoch": 0.7143641555018669, + "grad_norm": 1.8905227184295654, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.7090878486633301, + "num_tokens": 164977979.0, + "step": 6505 + }, + { + "epoch": 0.7144739732044806, + "grad_norm": 2.4945216178894043, + "learning_rate": 1e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.7360402345657349, + "num_tokens": 164998292.0, + "step": 6506 + }, + { + "epoch": 0.7145837909070942, + "grad_norm": 2.13352108001709, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7042300701141357, + "num_tokens": 165024443.0, + "step": 6507 + }, + { + "epoch": 0.7146936086097079, + "grad_norm": 2.0246620178222656, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.6986443996429443, + "num_tokens": 165053132.0, + "step": 6508 + }, + { + "epoch": 0.7148034263123215, + "grad_norm": 2.0875301361083984, + "learning_rate": 1e-06, + "loss": 0.9813, + 
"mean_token_accuracy": 0.7014054656028748, + "num_tokens": 165080475.0, + "step": 6509 + }, + { + "epoch": 0.7149132440149352, + "grad_norm": 1.8660272359848022, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7060608863830566, + "num_tokens": 165113495.0, + "step": 6510 + }, + { + "epoch": 0.7150230617175488, + "grad_norm": 2.3271665573120117, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7030247449874878, + "num_tokens": 165137582.0, + "step": 6511 + }, + { + "epoch": 0.7151328794201626, + "grad_norm": 2.1987791061401367, + "learning_rate": 1e-06, + "loss": 1.073, + "mean_token_accuracy": 0.6797347068786621, + "num_tokens": 165166861.0, + "step": 6512 + }, + { + "epoch": 0.7152426971227762, + "grad_norm": 2.2638027667999268, + "learning_rate": 1e-06, + "loss": 1.0258, + "mean_token_accuracy": 0.6916670799255371, + "num_tokens": 165193325.0, + "step": 6513 + }, + { + "epoch": 0.7153525148253899, + "grad_norm": 2.322812080383301, + "learning_rate": 1e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.682688295841217, + "num_tokens": 165218722.0, + "step": 6514 + }, + { + "epoch": 0.7154623325280035, + "grad_norm": 2.153900623321533, + "learning_rate": 1e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.6937398314476013, + "num_tokens": 165245319.0, + "step": 6515 + }, + { + "epoch": 0.7155721502306172, + "grad_norm": 2.2707536220550537, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7194221019744873, + "num_tokens": 165267350.0, + "step": 6516 + }, + { + "epoch": 0.7156819679332308, + "grad_norm": 2.0789618492126465, + "learning_rate": 1e-06, + "loss": 1.1218, + "mean_token_accuracy": 0.6602654457092285, + "num_tokens": 165296553.0, + "step": 6517 + }, + { + "epoch": 0.7157917856358444, + "grad_norm": 2.232910633087158, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7104265093803406, + "num_tokens": 165320009.0, + "step": 6518 + }, + { + "epoch": 0.7159016033384582, + 
"grad_norm": 2.3188846111297607, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.7185183763504028, + "num_tokens": 165342922.0, + "step": 6519 + }, + { + "epoch": 0.7160114210410718, + "grad_norm": 2.374019145965576, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7196243405342102, + "num_tokens": 165365606.0, + "step": 6520 + }, + { + "epoch": 0.7161212387436855, + "grad_norm": 1.9113473892211914, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7232866883277893, + "num_tokens": 165396623.0, + "step": 6521 + }, + { + "epoch": 0.7162310564462991, + "grad_norm": 2.327803611755371, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7157943248748779, + "num_tokens": 165419429.0, + "step": 6522 + }, + { + "epoch": 0.7163408741489128, + "grad_norm": 2.006075859069824, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.718208372592926, + "num_tokens": 165447153.0, + "step": 6523 + }, + { + "epoch": 0.7164506918515264, + "grad_norm": 2.22830867767334, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.7094476819038391, + "num_tokens": 165472591.0, + "step": 6524 + }, + { + "epoch": 0.7165605095541401, + "grad_norm": 2.4634618759155273, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7148032188415527, + "num_tokens": 165492658.0, + "step": 6525 + }, + { + "epoch": 0.7166703272567538, + "grad_norm": 2.5276365280151367, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7214244604110718, + "num_tokens": 165511781.0, + "step": 6526 + }, + { + "epoch": 0.7167801449593675, + "grad_norm": 2.1675267219543457, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.7001822590827942, + "num_tokens": 165536059.0, + "step": 6527 + }, + { + "epoch": 0.7168899626619811, + "grad_norm": 2.3101158142089844, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7067470550537109, + 
"num_tokens": 165558695.0, + "step": 6528 + }, + { + "epoch": 0.7169997803645948, + "grad_norm": 1.9359145164489746, + "learning_rate": 1e-06, + "loss": 1.0413, + "mean_token_accuracy": 0.6848587989807129, + "num_tokens": 165591387.0, + "step": 6529 + }, + { + "epoch": 0.7171095980672084, + "grad_norm": 2.097144603729248, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.6909899115562439, + "num_tokens": 165618557.0, + "step": 6530 + }, + { + "epoch": 0.7172194157698221, + "grad_norm": 2.194150686264038, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7121378779411316, + "num_tokens": 165642222.0, + "step": 6531 + }, + { + "epoch": 0.7173292334724357, + "grad_norm": 2.273265838623047, + "learning_rate": 1e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.6905553936958313, + "num_tokens": 165667456.0, + "step": 6532 + }, + { + "epoch": 0.7174390511750495, + "grad_norm": 2.469999074935913, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.71231609582901, + "num_tokens": 165690230.0, + "step": 6533 + }, + { + "epoch": 0.7175488688776631, + "grad_norm": 2.2799394130706787, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7057597637176514, + "num_tokens": 165714620.0, + "step": 6534 + }, + { + "epoch": 0.7176586865802768, + "grad_norm": 2.241553544998169, + "learning_rate": 1e-06, + "loss": 0.8249, + "mean_token_accuracy": 0.7401557564735413, + "num_tokens": 165737358.0, + "step": 6535 + }, + { + "epoch": 0.7177685042828904, + "grad_norm": 2.285137414932251, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7134829759597778, + "num_tokens": 165761787.0, + "step": 6536 + }, + { + "epoch": 0.717878321985504, + "grad_norm": 2.2697622776031494, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7098267078399658, + "num_tokens": 165786109.0, + "step": 6537 + }, + { + "epoch": 0.7179881396881177, + "grad_norm": 2.578197479248047, + "learning_rate": 
1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7167160511016846, + "num_tokens": 165804243.0, + "step": 6538 + }, + { + "epoch": 0.7180979573907313, + "grad_norm": 2.322021722793579, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7012722492218018, + "num_tokens": 165827725.0, + "step": 6539 + }, + { + "epoch": 0.7182077750933451, + "grad_norm": 2.4669501781463623, + "learning_rate": 1e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.7252014875411987, + "num_tokens": 165846254.0, + "step": 6540 + }, + { + "epoch": 0.7183175927959587, + "grad_norm": 2.26125168800354, + "learning_rate": 1e-06, + "loss": 1.0293, + "mean_token_accuracy": 0.6900578737258911, + "num_tokens": 165871753.0, + "step": 6541 + }, + { + "epoch": 0.7184274104985724, + "grad_norm": 2.2206833362579346, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.7142569422721863, + "num_tokens": 165896039.0, + "step": 6542 + }, + { + "epoch": 0.718537228201186, + "grad_norm": 2.137971878051758, + "learning_rate": 1e-06, + "loss": 0.846, + "mean_token_accuracy": 0.7354512810707092, + "num_tokens": 165920139.0, + "step": 6543 + }, + { + "epoch": 0.7186470459037997, + "grad_norm": 2.4984066486358643, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7120404243469238, + "num_tokens": 165941775.0, + "step": 6544 + }, + { + "epoch": 0.7187568636064133, + "grad_norm": 2.4155499935150146, + "learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.7330862283706665, + "num_tokens": 165962451.0, + "step": 6545 + }, + { + "epoch": 0.718866681309027, + "grad_norm": 2.1478941440582275, + "learning_rate": 1e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.6809950470924377, + "num_tokens": 165988403.0, + "step": 6546 + }, + { + "epoch": 0.7189764990116406, + "grad_norm": 2.7307822704315186, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7202225923538208, + "num_tokens": 166007021.0, + "step": 6547 + }, + { + "epoch": 
0.7190863167142544, + "grad_norm": 2.180711507797241, + "learning_rate": 1e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.6897954344749451, + "num_tokens": 166034577.0, + "step": 6548 + }, + { + "epoch": 0.719196134416868, + "grad_norm": 2.3910019397735596, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7193930149078369, + "num_tokens": 166057106.0, + "step": 6549 + }, + { + "epoch": 0.7193059521194817, + "grad_norm": 2.0711307525634766, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7048917412757874, + "num_tokens": 166086707.0, + "step": 6550 + }, + { + "epoch": 0.7194157698220953, + "grad_norm": 2.129883289337158, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7142040133476257, + "num_tokens": 166112785.0, + "step": 6551 + }, + { + "epoch": 0.719525587524709, + "grad_norm": 2.254049777984619, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7043830156326294, + "num_tokens": 166137612.0, + "step": 6552 + }, + { + "epoch": 0.7196354052273226, + "grad_norm": 2.0115604400634766, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.6963437795639038, + "num_tokens": 166165898.0, + "step": 6553 + }, + { + "epoch": 0.7197452229299363, + "grad_norm": 2.2059946060180664, + "learning_rate": 1e-06, + "loss": 1.0455, + "mean_token_accuracy": 0.6947356462478638, + "num_tokens": 166191543.0, + "step": 6554 + }, + { + "epoch": 0.71985504063255, + "grad_norm": 2.4386637210845947, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7001036405563354, + "num_tokens": 166214039.0, + "step": 6555 + }, + { + "epoch": 0.7199648583351637, + "grad_norm": 2.2241179943084717, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7152390480041504, + "num_tokens": 166238105.0, + "step": 6556 + }, + { + "epoch": 0.7200746760377773, + "grad_norm": 2.039888620376587, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 
0.7171761989593506, + "num_tokens": 166265173.0, + "step": 6557 + }, + { + "epoch": 0.720184493740391, + "grad_norm": 2.396087408065796, + "learning_rate": 1e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7159693241119385, + "num_tokens": 166287827.0, + "step": 6558 + }, + { + "epoch": 0.7202943114430046, + "grad_norm": 2.5727269649505615, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7187472581863403, + "num_tokens": 166306665.0, + "step": 6559 + }, + { + "epoch": 0.7204041291456182, + "grad_norm": 2.290447950363159, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7202068567276001, + "num_tokens": 166330412.0, + "step": 6560 + }, + { + "epoch": 0.7205139468482319, + "grad_norm": 2.170708656311035, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7162260413169861, + "num_tokens": 166354282.0, + "step": 6561 + }, + { + "epoch": 0.7206237645508456, + "grad_norm": 2.204190969467163, + "learning_rate": 1e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.728028416633606, + "num_tokens": 166376387.0, + "step": 6562 + }, + { + "epoch": 0.7207335822534593, + "grad_norm": 2.4204211235046387, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7033168077468872, + "num_tokens": 166397984.0, + "step": 6563 + }, + { + "epoch": 0.7208433999560729, + "grad_norm": 2.077540636062622, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7209380865097046, + "num_tokens": 166426865.0, + "step": 6564 + }, + { + "epoch": 0.7209532176586866, + "grad_norm": 2.102644443511963, + "learning_rate": 1e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7266132235527039, + "num_tokens": 166454395.0, + "step": 6565 + }, + { + "epoch": 0.7210630353613002, + "grad_norm": 2.278745651245117, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.697553813457489, + "num_tokens": 166479328.0, + "step": 6566 + }, + { + "epoch": 0.7211728530639139, + "grad_norm": 2.109586715698242, 
+ "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7121490836143494, + "num_tokens": 166506095.0, + "step": 6567 + }, + { + "epoch": 0.7212826707665275, + "grad_norm": 2.150824546813965, + "learning_rate": 1e-06, + "loss": 1.0302, + "mean_token_accuracy": 0.6919293403625488, + "num_tokens": 166535200.0, + "step": 6568 + }, + { + "epoch": 0.7213924884691413, + "grad_norm": 2.4590275287628174, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7232515215873718, + "num_tokens": 166556010.0, + "step": 6569 + }, + { + "epoch": 0.7215023061717549, + "grad_norm": 2.135976552963257, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.6929516196250916, + "num_tokens": 166584953.0, + "step": 6570 + }, + { + "epoch": 0.7216121238743686, + "grad_norm": 2.1107938289642334, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7147859334945679, + "num_tokens": 166612004.0, + "step": 6571 + }, + { + "epoch": 0.7217219415769822, + "grad_norm": 2.1416046619415283, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7090451121330261, + "num_tokens": 166637007.0, + "step": 6572 + }, + { + "epoch": 0.7218317592795959, + "grad_norm": 2.3267621994018555, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7121063470840454, + "num_tokens": 166659736.0, + "step": 6573 + }, + { + "epoch": 0.7219415769822095, + "grad_norm": 1.9436659812927246, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.708429753780365, + "num_tokens": 166690480.0, + "step": 6574 + }, + { + "epoch": 0.7220513946848232, + "grad_norm": 2.187627077102661, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.6968349814414978, + "num_tokens": 166715334.0, + "step": 6575 + }, + { + "epoch": 0.7221612123874368, + "grad_norm": 2.3175947666168213, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7088682055473328, + "num_tokens": 166740025.0, + "step": 
6576 + }, + { + "epoch": 0.7222710300900506, + "grad_norm": 2.1271111965179443, + "learning_rate": 1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.6995753049850464, + "num_tokens": 166768124.0, + "step": 6577 + }, + { + "epoch": 0.7223808477926642, + "grad_norm": 2.3342628479003906, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7118415236473083, + "num_tokens": 166791806.0, + "step": 6578 + }, + { + "epoch": 0.7224906654952779, + "grad_norm": 2.2854321002960205, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.6971869468688965, + "num_tokens": 166817146.0, + "step": 6579 + }, + { + "epoch": 0.7226004831978915, + "grad_norm": 2.1152725219726562, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7143902778625488, + "num_tokens": 166844973.0, + "step": 6580 + }, + { + "epoch": 0.7227103009005051, + "grad_norm": 2.2153160572052, + "learning_rate": 1e-06, + "loss": 1.0531, + "mean_token_accuracy": 0.6784918904304504, + "num_tokens": 166871935.0, + "step": 6581 + }, + { + "epoch": 0.7228201186031188, + "grad_norm": 2.363542318344116, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7047570943832397, + "num_tokens": 166894585.0, + "step": 6582 + }, + { + "epoch": 0.7229299363057324, + "grad_norm": 2.471889019012451, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7041181921958923, + "num_tokens": 166914184.0, + "step": 6583 + }, + { + "epoch": 0.7230397540083462, + "grad_norm": 2.167469024658203, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.702093243598938, + "num_tokens": 166940575.0, + "step": 6584 + }, + { + "epoch": 0.7231495717109598, + "grad_norm": 2.0109152793884277, + "learning_rate": 1e-06, + "loss": 1.0353, + "mean_token_accuracy": 0.6873587965965271, + "num_tokens": 166970239.0, + "step": 6585 + }, + { + "epoch": 0.7232593894135735, + "grad_norm": 2.3875279426574707, + "learning_rate": 1e-06, + "loss": 1.01, + 
"mean_token_accuracy": 0.6978054046630859, + "num_tokens": 166993031.0, + "step": 6586 + }, + { + "epoch": 0.7233692071161871, + "grad_norm": 2.0646417140960693, + "learning_rate": 1e-06, + "loss": 1.0076, + "mean_token_accuracy": 0.6955294609069824, + "num_tokens": 167021619.0, + "step": 6587 + }, + { + "epoch": 0.7234790248188008, + "grad_norm": 2.2742249965667725, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7215201258659363, + "num_tokens": 167045500.0, + "step": 6588 + }, + { + "epoch": 0.7235888425214144, + "grad_norm": 2.2167036533355713, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7095754742622375, + "num_tokens": 167071417.0, + "step": 6589 + }, + { + "epoch": 0.7236986602240281, + "grad_norm": 2.2861101627349854, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7116495370864868, + "num_tokens": 167093960.0, + "step": 6590 + }, + { + "epoch": 0.7238084779266418, + "grad_norm": 2.351349353790283, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.701469361782074, + "num_tokens": 167117390.0, + "step": 6591 + }, + { + "epoch": 0.7239182956292555, + "grad_norm": 2.210365056991577, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.7017251253128052, + "num_tokens": 167142034.0, + "step": 6592 + }, + { + "epoch": 0.7240281133318691, + "grad_norm": 2.2953388690948486, + "learning_rate": 1e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7242460250854492, + "num_tokens": 167165779.0, + "step": 6593 + }, + { + "epoch": 0.7241379310344828, + "grad_norm": 2.044107675552368, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7056159973144531, + "num_tokens": 167193980.0, + "step": 6594 + }, + { + "epoch": 0.7242477487370964, + "grad_norm": 2.5864157676696777, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.723590612411499, + "num_tokens": 167212802.0, + "step": 6595 + }, + { + "epoch": 0.72435756643971, + 
"grad_norm": 2.274371385574341, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7020015716552734, + "num_tokens": 167235466.0, + "step": 6596 + }, + { + "epoch": 0.7244673841423237, + "grad_norm": 2.234158515930176, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7140660285949707, + "num_tokens": 167260963.0, + "step": 6597 + }, + { + "epoch": 0.7245772018449375, + "grad_norm": 2.0367634296417236, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7098267078399658, + "num_tokens": 167288879.0, + "step": 6598 + }, + { + "epoch": 0.7246870195475511, + "grad_norm": 2.01928448677063, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.6925963759422302, + "num_tokens": 167318400.0, + "step": 6599 + }, + { + "epoch": 0.7247968372501647, + "grad_norm": 2.2783620357513428, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7218361496925354, + "num_tokens": 167342512.0, + "step": 6600 + }, + { + "epoch": 0.7249066549527784, + "grad_norm": 2.3332877159118652, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7005404829978943, + "num_tokens": 167367939.0, + "step": 6601 + }, + { + "epoch": 0.725016472655392, + "grad_norm": 2.2401599884033203, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7072407007217407, + "num_tokens": 167391720.0, + "step": 6602 + }, + { + "epoch": 0.7251262903580057, + "grad_norm": 2.1479055881500244, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.6865783929824829, + "num_tokens": 167418175.0, + "step": 6603 + }, + { + "epoch": 0.7252361080606193, + "grad_norm": 2.1783900260925293, + "learning_rate": 1e-06, + "loss": 0.8564, + "mean_token_accuracy": 0.7349890470504761, + "num_tokens": 167440805.0, + "step": 6604 + }, + { + "epoch": 0.725345925763233, + "grad_norm": 2.3695321083068848, + "learning_rate": 1e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.726777970790863, + 
"num_tokens": 167461728.0, + "step": 6605 + }, + { + "epoch": 0.7254557434658467, + "grad_norm": 2.04787540435791, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7087118625640869, + "num_tokens": 167491083.0, + "step": 6606 + }, + { + "epoch": 0.7255655611684604, + "grad_norm": 2.4623870849609375, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7226930260658264, + "num_tokens": 167512133.0, + "step": 6607 + }, + { + "epoch": 0.725675378871074, + "grad_norm": 2.1721582412719727, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7187979817390442, + "num_tokens": 167538285.0, + "step": 6608 + }, + { + "epoch": 0.7257851965736877, + "grad_norm": 2.036980628967285, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.6947011947631836, + "num_tokens": 167568607.0, + "step": 6609 + }, + { + "epoch": 0.7258950142763013, + "grad_norm": 2.4839224815368652, + "learning_rate": 1e-06, + "loss": 0.8644, + "mean_token_accuracy": 0.7311664819717407, + "num_tokens": 167586875.0, + "step": 6610 + }, + { + "epoch": 0.726004831978915, + "grad_norm": 2.141923666000366, + "learning_rate": 1e-06, + "loss": 0.8736, + "mean_token_accuracy": 0.7276335954666138, + "num_tokens": 167611284.0, + "step": 6611 + }, + { + "epoch": 0.7261146496815286, + "grad_norm": 2.242912769317627, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7043240070343018, + "num_tokens": 167634853.0, + "step": 6612 + }, + { + "epoch": 0.7262244673841424, + "grad_norm": 2.271507740020752, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.6961755752563477, + "num_tokens": 167660207.0, + "step": 6613 + }, + { + "epoch": 0.726334285086756, + "grad_norm": 2.1377522945404053, + "learning_rate": 1e-06, + "loss": 1.0346, + "mean_token_accuracy": 0.6800804138183594, + "num_tokens": 167686277.0, + "step": 6614 + }, + { + "epoch": 0.7264441027893697, + "grad_norm": 2.622589111328125, + "learning_rate": 
1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7054846286773682, + "num_tokens": 167706654.0, + "step": 6615 + }, + { + "epoch": 0.7265539204919833, + "grad_norm": 1.9750347137451172, + "learning_rate": 1e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7215404510498047, + "num_tokens": 167736988.0, + "step": 6616 + }, + { + "epoch": 0.726663738194597, + "grad_norm": 2.43923282623291, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7199702262878418, + "num_tokens": 167758184.0, + "step": 6617 + }, + { + "epoch": 0.7267735558972106, + "grad_norm": 2.309363603591919, + "learning_rate": 1e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.6892673373222351, + "num_tokens": 167781411.0, + "step": 6618 + }, + { + "epoch": 0.7268833735998242, + "grad_norm": 2.3343231678009033, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.6962805986404419, + "num_tokens": 167805226.0, + "step": 6619 + }, + { + "epoch": 0.726993191302438, + "grad_norm": 2.029676675796509, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7193652987480164, + "num_tokens": 167833597.0, + "step": 6620 + }, + { + "epoch": 0.7271030090050516, + "grad_norm": 2.0152807235717773, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.717205286026001, + "num_tokens": 167864954.0, + "step": 6621 + }, + { + "epoch": 0.7272128267076653, + "grad_norm": 1.9646382331848145, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.6997314691543579, + "num_tokens": 167895397.0, + "step": 6622 + }, + { + "epoch": 0.7273226444102789, + "grad_norm": 2.29081654548645, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.695796012878418, + "num_tokens": 167922292.0, + "step": 6623 + }, + { + "epoch": 0.7274324621128926, + "grad_norm": 2.1842031478881836, + "learning_rate": 1e-06, + "loss": 1.0739, + "mean_token_accuracy": 0.6751395463943481, + "num_tokens": 167947250.0, + "step": 6624 + }, + { + "epoch": 
0.7275422798155062, + "grad_norm": 2.4062020778656006, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.6961808800697327, + "num_tokens": 167969379.0, + "step": 6625 + }, + { + "epoch": 0.7276520975181199, + "grad_norm": 2.370084524154663, + "learning_rate": 1e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7307524681091309, + "num_tokens": 167990067.0, + "step": 6626 + }, + { + "epoch": 0.7277619152207336, + "grad_norm": 1.950814962387085, + "learning_rate": 1e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.6988860368728638, + "num_tokens": 168020536.0, + "step": 6627 + }, + { + "epoch": 0.7278717329233473, + "grad_norm": 2.4133238792419434, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7216848134994507, + "num_tokens": 168042705.0, + "step": 6628 + }, + { + "epoch": 0.7279815506259609, + "grad_norm": 2.076852798461914, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.6988651752471924, + "num_tokens": 168071313.0, + "step": 6629 + }, + { + "epoch": 0.7280913683285746, + "grad_norm": 2.3671507835388184, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7102035880088806, + "num_tokens": 168092638.0, + "step": 6630 + }, + { + "epoch": 0.7282011860311882, + "grad_norm": 1.9430798292160034, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7065492272377014, + "num_tokens": 168123573.0, + "step": 6631 + }, + { + "epoch": 0.7283110037338019, + "grad_norm": 2.1427648067474365, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.6983520984649658, + "num_tokens": 168147513.0, + "step": 6632 + }, + { + "epoch": 0.7284208214364155, + "grad_norm": 2.2966580390930176, + "learning_rate": 1e-06, + "loss": 0.8119, + "mean_token_accuracy": 0.7437450885772705, + "num_tokens": 168168414.0, + "step": 6633 + }, + { + "epoch": 0.7285306391390292, + "grad_norm": 2.3738749027252197, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 
0.7183492183685303, + "num_tokens": 168191230.0, + "step": 6634 + }, + { + "epoch": 0.7286404568416429, + "grad_norm": 2.232492446899414, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7158510088920593, + "num_tokens": 168215526.0, + "step": 6635 + }, + { + "epoch": 0.7287502745442566, + "grad_norm": 1.9502755403518677, + "learning_rate": 1e-06, + "loss": 1.0298, + "mean_token_accuracy": 0.6958852410316467, + "num_tokens": 168247105.0, + "step": 6636 + }, + { + "epoch": 0.7288600922468702, + "grad_norm": 2.307736873626709, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7064998149871826, + "num_tokens": 168270289.0, + "step": 6637 + }, + { + "epoch": 0.7289699099494839, + "grad_norm": 2.041764974594116, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7141232490539551, + "num_tokens": 168299124.0, + "step": 6638 + }, + { + "epoch": 0.7290797276520975, + "grad_norm": 2.170718193054199, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7226772904396057, + "num_tokens": 168322820.0, + "step": 6639 + }, + { + "epoch": 0.7291895453547111, + "grad_norm": 2.204633951187134, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7036820650100708, + "num_tokens": 168346802.0, + "step": 6640 + }, + { + "epoch": 0.7292993630573248, + "grad_norm": 2.329857349395752, + "learning_rate": 1e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.7135919332504272, + "num_tokens": 168370239.0, + "step": 6641 + }, + { + "epoch": 0.7294091807599385, + "grad_norm": 2.141692638397217, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.6898559927940369, + "num_tokens": 168396256.0, + "step": 6642 + }, + { + "epoch": 0.7295189984625522, + "grad_norm": 1.9980566501617432, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.6980587244033813, + "num_tokens": 168425872.0, + "step": 6643 + }, + { + "epoch": 0.7296288161651658, + "grad_norm": 
2.0736639499664307, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7021493911743164, + "num_tokens": 168453003.0, + "step": 6644 + }, + { + "epoch": 0.7297386338677795, + "grad_norm": 2.1128430366516113, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7044883966445923, + "num_tokens": 168481589.0, + "step": 6645 + }, + { + "epoch": 0.7298484515703931, + "grad_norm": 2.0076494216918945, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7130166292190552, + "num_tokens": 168510055.0, + "step": 6646 + }, + { + "epoch": 0.7299582692730068, + "grad_norm": 2.133859395980835, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7038508057594299, + "num_tokens": 168533645.0, + "step": 6647 + }, + { + "epoch": 0.7300680869756204, + "grad_norm": 2.443842649459839, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7151427268981934, + "num_tokens": 168554723.0, + "step": 6648 + }, + { + "epoch": 0.7301779046782342, + "grad_norm": 2.466038942337036, + "learning_rate": 1e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.7420940399169922, + "num_tokens": 168573231.0, + "step": 6649 + }, + { + "epoch": 0.7302877223808478, + "grad_norm": 2.1666295528411865, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7253716588020325, + "num_tokens": 168596544.0, + "step": 6650 + }, + { + "epoch": 0.7303975400834615, + "grad_norm": 2.007438898086548, + "learning_rate": 1e-06, + "loss": 1.0746, + "mean_token_accuracy": 0.6794520616531372, + "num_tokens": 168629141.0, + "step": 6651 + }, + { + "epoch": 0.7305073577860751, + "grad_norm": 2.261906147003174, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7206094264984131, + "num_tokens": 168652232.0, + "step": 6652 + }, + { + "epoch": 0.7306171754886888, + "grad_norm": 2.3395638465881348, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.712032675743103, + "num_tokens": 
168673842.0, + "step": 6653 + }, + { + "epoch": 0.7307269931913024, + "grad_norm": 2.1592910289764404, + "learning_rate": 1e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7270253300666809, + "num_tokens": 168698235.0, + "step": 6654 + }, + { + "epoch": 0.7308368108939161, + "grad_norm": 2.3168015480041504, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7006227970123291, + "num_tokens": 168721845.0, + "step": 6655 + }, + { + "epoch": 0.7309466285965298, + "grad_norm": 2.2801265716552734, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7081726789474487, + "num_tokens": 168744375.0, + "step": 6656 + }, + { + "epoch": 0.7310564462991435, + "grad_norm": 2.326535940170288, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.6919607520103455, + "num_tokens": 168769537.0, + "step": 6657 + }, + { + "epoch": 0.7311662640017571, + "grad_norm": 2.261220932006836, + "learning_rate": 1e-06, + "loss": 1.0419, + "mean_token_accuracy": 0.6931757926940918, + "num_tokens": 168795874.0, + "step": 6658 + }, + { + "epoch": 0.7312760817043708, + "grad_norm": 2.0560944080352783, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.702899694442749, + "num_tokens": 168825572.0, + "step": 6659 + }, + { + "epoch": 0.7313858994069844, + "grad_norm": 2.1072142124176025, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.6959071159362793, + "num_tokens": 168853374.0, + "step": 6660 + }, + { + "epoch": 0.731495717109598, + "grad_norm": 2.1030566692352295, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.6983050107955933, + "num_tokens": 168882853.0, + "step": 6661 + }, + { + "epoch": 0.7316055348122117, + "grad_norm": 2.4279658794403076, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7237453460693359, + "num_tokens": 168904547.0, + "step": 6662 + }, + { + "epoch": 0.7317153525148253, + "grad_norm": 2.139138698577881, + "learning_rate": 1e-06, + 
"loss": 1.0061, + "mean_token_accuracy": 0.6932990550994873, + "num_tokens": 168931053.0, + "step": 6663 + }, + { + "epoch": 0.7318251702174391, + "grad_norm": 2.1992738246917725, + "learning_rate": 1e-06, + "loss": 0.8834, + "mean_token_accuracy": 0.730090856552124, + "num_tokens": 168955505.0, + "step": 6664 + }, + { + "epoch": 0.7319349879200527, + "grad_norm": 2.1891956329345703, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.6985872387886047, + "num_tokens": 168983756.0, + "step": 6665 + }, + { + "epoch": 0.7320448056226664, + "grad_norm": 2.5630640983581543, + "learning_rate": 1e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.738610029220581, + "num_tokens": 169002825.0, + "step": 6666 + }, + { + "epoch": 0.73215462332528, + "grad_norm": 2.7303555011749268, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7284040451049805, + "num_tokens": 169019126.0, + "step": 6667 + }, + { + "epoch": 0.7322644410278937, + "grad_norm": 1.9499057531356812, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7023828029632568, + "num_tokens": 169048915.0, + "step": 6668 + }, + { + "epoch": 0.7323742587305073, + "grad_norm": 2.1558518409729004, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.694674015045166, + "num_tokens": 169077247.0, + "step": 6669 + }, + { + "epoch": 0.732484076433121, + "grad_norm": 2.2319061756134033, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7171204090118408, + "num_tokens": 169100609.0, + "step": 6670 + }, + { + "epoch": 0.7325938941357347, + "grad_norm": 2.1313588619232178, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7035622000694275, + "num_tokens": 169127077.0, + "step": 6671 + }, + { + "epoch": 0.7327037118383484, + "grad_norm": 2.255183219909668, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7150366306304932, + "num_tokens": 169150719.0, + "step": 6672 + }, + { + "epoch": 
0.732813529540962, + "grad_norm": 2.015237808227539, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7287449836730957, + "num_tokens": 169179535.0, + "step": 6673 + }, + { + "epoch": 0.7329233472435757, + "grad_norm": 1.9229284524917603, + "learning_rate": 1e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.6869387030601501, + "num_tokens": 169213564.0, + "step": 6674 + }, + { + "epoch": 0.7330331649461893, + "grad_norm": 2.4775338172912598, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7189476490020752, + "num_tokens": 169235142.0, + "step": 6675 + }, + { + "epoch": 0.733142982648803, + "grad_norm": 2.3625025749206543, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7157322764396667, + "num_tokens": 169257462.0, + "step": 6676 + }, + { + "epoch": 0.7332528003514166, + "grad_norm": 2.0962257385253906, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7149850726127625, + "num_tokens": 169283714.0, + "step": 6677 + }, + { + "epoch": 0.7333626180540304, + "grad_norm": 2.1166648864746094, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7243419885635376, + "num_tokens": 169309628.0, + "step": 6678 + }, + { + "epoch": 0.733472435756644, + "grad_norm": 2.390589475631714, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7179657816886902, + "num_tokens": 169329988.0, + "step": 6679 + }, + { + "epoch": 0.7335822534592576, + "grad_norm": 2.1027274131774902, + "learning_rate": 1e-06, + "loss": 1.0581, + "mean_token_accuracy": 0.682161271572113, + "num_tokens": 169357534.0, + "step": 6680 + }, + { + "epoch": 0.7336920711618713, + "grad_norm": 2.10306978225708, + "learning_rate": 1e-06, + "loss": 1.0539, + "mean_token_accuracy": 0.6828506588935852, + "num_tokens": 169386552.0, + "step": 6681 + }, + { + "epoch": 0.7338018888644849, + "grad_norm": 1.921446442604065, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 
0.7119686007499695, + "num_tokens": 169418015.0, + "step": 6682 + }, + { + "epoch": 0.7339117065670986, + "grad_norm": 1.864010214805603, + "learning_rate": 1e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.6942189931869507, + "num_tokens": 169453698.0, + "step": 6683 + }, + { + "epoch": 0.7340215242697122, + "grad_norm": 2.2824482917785645, + "learning_rate": 1e-06, + "loss": 1.0246, + "mean_token_accuracy": 0.6950159072875977, + "num_tokens": 169478609.0, + "step": 6684 + }, + { + "epoch": 0.734131341972326, + "grad_norm": 2.232490301132202, + "learning_rate": 1e-06, + "loss": 1.0488, + "mean_token_accuracy": 0.6974548697471619, + "num_tokens": 169503962.0, + "step": 6685 + }, + { + "epoch": 0.7342411596749396, + "grad_norm": 2.0867843627929688, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7231284379959106, + "num_tokens": 169533188.0, + "step": 6686 + }, + { + "epoch": 0.7343509773775533, + "grad_norm": 2.081667900085449, + "learning_rate": 1e-06, + "loss": 1.0744, + "mean_token_accuracy": 0.6695274114608765, + "num_tokens": 169562574.0, + "step": 6687 + }, + { + "epoch": 0.7344607950801669, + "grad_norm": 2.2970027923583984, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7154254913330078, + "num_tokens": 169588497.0, + "step": 6688 + }, + { + "epoch": 0.7345706127827806, + "grad_norm": 2.0395302772521973, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.692527174949646, + "num_tokens": 169616963.0, + "step": 6689 + }, + { + "epoch": 0.7346804304853942, + "grad_norm": 2.1554415225982666, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7074945569038391, + "num_tokens": 169643662.0, + "step": 6690 + }, + { + "epoch": 0.7347902481880079, + "grad_norm": 2.3359007835388184, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7197858095169067, + "num_tokens": 169665774.0, + "step": 6691 + }, + { + "epoch": 0.7349000658906216, + "grad_norm": 
2.1208267211914062, + "learning_rate": 1e-06, + "loss": 1.054, + "mean_token_accuracy": 0.6866714954376221, + "num_tokens": 169693319.0, + "step": 6692 + }, + { + "epoch": 0.7350098835932353, + "grad_norm": 2.0344948768615723, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.7059866189956665, + "num_tokens": 169721847.0, + "step": 6693 + }, + { + "epoch": 0.7351197012958489, + "grad_norm": 2.0210089683532715, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7207381129264832, + "num_tokens": 169750463.0, + "step": 6694 + }, + { + "epoch": 0.7352295189984626, + "grad_norm": 2.498143196105957, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7130886316299438, + "num_tokens": 169769665.0, + "step": 6695 + }, + { + "epoch": 0.7353393367010762, + "grad_norm": 2.293734073638916, + "learning_rate": 1e-06, + "loss": 1.0314, + "mean_token_accuracy": 0.6876320242881775, + "num_tokens": 169795449.0, + "step": 6696 + }, + { + "epoch": 0.7354491544036899, + "grad_norm": 2.2197868824005127, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7026099562644958, + "num_tokens": 169819628.0, + "step": 6697 + }, + { + "epoch": 0.7355589721063035, + "grad_norm": 2.2091445922851562, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.6965474486351013, + "num_tokens": 169847124.0, + "step": 6698 + }, + { + "epoch": 0.7356687898089171, + "grad_norm": 2.316895008087158, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7139086723327637, + "num_tokens": 169872112.0, + "step": 6699 + }, + { + "epoch": 0.7357786075115309, + "grad_norm": 2.3569765090942383, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7231849431991577, + "num_tokens": 169892606.0, + "step": 6700 + }, + { + "epoch": 0.7358884252141445, + "grad_norm": 2.2905731201171875, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7094688415527344, + "num_tokens": 
169915071.0, + "step": 6701 + }, + { + "epoch": 0.7359982429167582, + "grad_norm": 1.8961820602416992, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7176686525344849, + "num_tokens": 169946677.0, + "step": 6702 + }, + { + "epoch": 0.7361080606193718, + "grad_norm": 1.8562886714935303, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7129980325698853, + "num_tokens": 169978224.0, + "step": 6703 + }, + { + "epoch": 0.7362178783219855, + "grad_norm": 2.4536025524139404, + "learning_rate": 1e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7331287860870361, + "num_tokens": 169998782.0, + "step": 6704 + }, + { + "epoch": 0.7363276960245991, + "grad_norm": 2.5408523082733154, + "learning_rate": 1e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7412590980529785, + "num_tokens": 170018539.0, + "step": 6705 + }, + { + "epoch": 0.7364375137272128, + "grad_norm": 2.574518918991089, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7050985097885132, + "num_tokens": 170038012.0, + "step": 6706 + }, + { + "epoch": 0.7365473314298265, + "grad_norm": 2.2162973880767822, + "learning_rate": 1e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.7308098077774048, + "num_tokens": 170061466.0, + "step": 6707 + }, + { + "epoch": 0.7366571491324402, + "grad_norm": 2.3803977966308594, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7177741527557373, + "num_tokens": 170082609.0, + "step": 6708 + }, + { + "epoch": 0.7367669668350538, + "grad_norm": 2.1193666458129883, + "learning_rate": 1e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.6919953227043152, + "num_tokens": 170111788.0, + "step": 6709 + }, + { + "epoch": 0.7368767845376675, + "grad_norm": 2.5051991939544678, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.6939051151275635, + "num_tokens": 170133424.0, + "step": 6710 + }, + { + "epoch": 0.7369866022402811, + "grad_norm": 2.4511513710021973, + "learning_rate": 1e-06, 
+ "loss": 0.905, + "mean_token_accuracy": 0.7207053303718567, + "num_tokens": 170153563.0, + "step": 6711 + }, + { + "epoch": 0.7370964199428948, + "grad_norm": 2.0504891872406006, + "learning_rate": 1e-06, + "loss": 1.04, + "mean_token_accuracy": 0.6903845071792603, + "num_tokens": 170183264.0, + "step": 6712 + }, + { + "epoch": 0.7372062376455084, + "grad_norm": 2.1782283782958984, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.7032682299613953, + "num_tokens": 170207900.0, + "step": 6713 + }, + { + "epoch": 0.7373160553481222, + "grad_norm": 2.0875537395477295, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.7190618515014648, + "num_tokens": 170233840.0, + "step": 6714 + }, + { + "epoch": 0.7374258730507358, + "grad_norm": 2.0279784202575684, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7035423517227173, + "num_tokens": 170264377.0, + "step": 6715 + }, + { + "epoch": 0.7375356907533495, + "grad_norm": 2.075396776199341, + "learning_rate": 1e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.6984535455703735, + "num_tokens": 170291496.0, + "step": 6716 + }, + { + "epoch": 0.7376455084559631, + "grad_norm": 2.5058860778808594, + "learning_rate": 1e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.6878895163536072, + "num_tokens": 170313507.0, + "step": 6717 + }, + { + "epoch": 0.7377553261585768, + "grad_norm": 2.2587976455688477, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7105826735496521, + "num_tokens": 170336719.0, + "step": 6718 + }, + { + "epoch": 0.7378651438611904, + "grad_norm": 2.3403055667877197, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7085756063461304, + "num_tokens": 170359973.0, + "step": 6719 + }, + { + "epoch": 0.737974961563804, + "grad_norm": 2.372968912124634, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7249821424484253, + "num_tokens": 170382611.0, + "step": 6720 + }, + { + "epoch": 
0.7380847792664178, + "grad_norm": 2.2952616214752197, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7101218700408936, + "num_tokens": 170405878.0, + "step": 6721 + }, + { + "epoch": 0.7381945969690314, + "grad_norm": 2.272691249847412, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.6999855041503906, + "num_tokens": 170428921.0, + "step": 6722 + }, + { + "epoch": 0.7383044146716451, + "grad_norm": 2.249570846557617, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.711786150932312, + "num_tokens": 170453186.0, + "step": 6723 + }, + { + "epoch": 0.7384142323742587, + "grad_norm": 2.3193814754486084, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.712051272392273, + "num_tokens": 170476526.0, + "step": 6724 + }, + { + "epoch": 0.7385240500768724, + "grad_norm": 2.380070447921753, + "learning_rate": 1e-06, + "loss": 0.8223, + "mean_token_accuracy": 0.7417225241661072, + "num_tokens": 170494967.0, + "step": 6725 + }, + { + "epoch": 0.738633867779486, + "grad_norm": 2.2082533836364746, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.6980090141296387, + "num_tokens": 170520710.0, + "step": 6726 + }, + { + "epoch": 0.7387436854820997, + "grad_norm": 2.2898683547973633, + "learning_rate": 1e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.7219407558441162, + "num_tokens": 170543390.0, + "step": 6727 + }, + { + "epoch": 0.7388535031847133, + "grad_norm": 1.9496185779571533, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7168665528297424, + "num_tokens": 170573611.0, + "step": 6728 + }, + { + "epoch": 0.7389633208873271, + "grad_norm": 2.579925537109375, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7199199199676514, + "num_tokens": 170592981.0, + "step": 6729 + }, + { + "epoch": 0.7390731385899407, + "grad_norm": 2.140291452407837, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 
0.703588604927063, + "num_tokens": 170619625.0, + "step": 6730 + }, + { + "epoch": 0.7391829562925544, + "grad_norm": 2.1193196773529053, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7126034498214722, + "num_tokens": 170645209.0, + "step": 6731 + }, + { + "epoch": 0.739292773995168, + "grad_norm": 2.4911856651306152, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7092093825340271, + "num_tokens": 170665831.0, + "step": 6732 + }, + { + "epoch": 0.7394025916977817, + "grad_norm": 2.3188610076904297, + "learning_rate": 1e-06, + "loss": 0.8661, + "mean_token_accuracy": 0.7267286777496338, + "num_tokens": 170688659.0, + "step": 6733 + }, + { + "epoch": 0.7395124094003953, + "grad_norm": 2.3127334117889404, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.6995970010757446, + "num_tokens": 170713140.0, + "step": 6734 + }, + { + "epoch": 0.739622227103009, + "grad_norm": 2.1597423553466797, + "learning_rate": 1e-06, + "loss": 0.8557, + "mean_token_accuracy": 0.7312394380569458, + "num_tokens": 170739302.0, + "step": 6735 + }, + { + "epoch": 0.7397320448056227, + "grad_norm": 2.264185667037964, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.6932104229927063, + "num_tokens": 170767732.0, + "step": 6736 + }, + { + "epoch": 0.7398418625082364, + "grad_norm": 2.705923080444336, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.709083080291748, + "num_tokens": 170785900.0, + "step": 6737 + }, + { + "epoch": 0.73995168021085, + "grad_norm": 2.327343702316284, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7096911668777466, + "num_tokens": 170808447.0, + "step": 6738 + }, + { + "epoch": 0.7400614979134637, + "grad_norm": 2.247471570968628, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7120226621627808, + "num_tokens": 170834641.0, + "step": 6739 + }, + { + "epoch": 0.7401713156160773, + "grad_norm": 
2.1960079669952393, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7205859422683716, + "num_tokens": 170860426.0, + "step": 6740 + }, + { + "epoch": 0.7402811333186909, + "grad_norm": 2.361851692199707, + "learning_rate": 1e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.7465275526046753, + "num_tokens": 170881776.0, + "step": 6741 + }, + { + "epoch": 0.7403909510213046, + "grad_norm": 2.497398853302002, + "learning_rate": 1e-06, + "loss": 0.8346, + "mean_token_accuracy": 0.7513656616210938, + "num_tokens": 170901461.0, + "step": 6742 + }, + { + "epoch": 0.7405007687239183, + "grad_norm": 1.9960352182388306, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7106592655181885, + "num_tokens": 170931063.0, + "step": 6743 + }, + { + "epoch": 0.740610586426532, + "grad_norm": 1.9922668933868408, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.6975274085998535, + "num_tokens": 170962036.0, + "step": 6744 + }, + { + "epoch": 0.7407204041291456, + "grad_norm": 2.436312675476074, + "learning_rate": 1e-06, + "loss": 0.8571, + "mean_token_accuracy": 0.7262861132621765, + "num_tokens": 170980605.0, + "step": 6745 + }, + { + "epoch": 0.7408302218317593, + "grad_norm": 2.2662880420684814, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7154802083969116, + "num_tokens": 171005602.0, + "step": 6746 + }, + { + "epoch": 0.7409400395343729, + "grad_norm": 2.238954782485962, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7085468769073486, + "num_tokens": 171031098.0, + "step": 6747 + }, + { + "epoch": 0.7410498572369866, + "grad_norm": 2.019359827041626, + "learning_rate": 1e-06, + "loss": 1.0468, + "mean_token_accuracy": 0.6828824281692505, + "num_tokens": 171062305.0, + "step": 6748 + }, + { + "epoch": 0.7411596749396002, + "grad_norm": 2.075385332107544, + "learning_rate": 1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.6912649869918823, + "num_tokens": 
171091618.0, + "step": 6749 + }, + { + "epoch": 0.741269492642214, + "grad_norm": 2.351564407348633, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.7045543789863586, + "num_tokens": 171113365.0, + "step": 6750 + }, + { + "epoch": 0.7413793103448276, + "grad_norm": 2.337146043777466, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.6940557360649109, + "num_tokens": 171136731.0, + "step": 6751 + }, + { + "epoch": 0.7414891280474413, + "grad_norm": 2.3320891857147217, + "learning_rate": 1e-06, + "loss": 0.8485, + "mean_token_accuracy": 0.7370116710662842, + "num_tokens": 171157974.0, + "step": 6752 + }, + { + "epoch": 0.7415989457500549, + "grad_norm": 2.378859043121338, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7075685858726501, + "num_tokens": 171179546.0, + "step": 6753 + }, + { + "epoch": 0.7417087634526686, + "grad_norm": 2.555691957473755, + "learning_rate": 1e-06, + "loss": 1.034, + "mean_token_accuracy": 0.6854642629623413, + "num_tokens": 171202108.0, + "step": 6754 + }, + { + "epoch": 0.7418185811552822, + "grad_norm": 2.1339471340179443, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7030735015869141, + "num_tokens": 171227825.0, + "step": 6755 + }, + { + "epoch": 0.7419283988578959, + "grad_norm": 2.099268674850464, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7146088480949402, + "num_tokens": 171254077.0, + "step": 6756 + }, + { + "epoch": 0.7420382165605095, + "grad_norm": 2.109240770339966, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.696860671043396, + "num_tokens": 171281851.0, + "step": 6757 + }, + { + "epoch": 0.7421480342631233, + "grad_norm": 2.2058663368225098, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.6875157356262207, + "num_tokens": 171307979.0, + "step": 6758 + }, + { + "epoch": 0.7422578519657369, + "grad_norm": 2.1849472522735596, + "learning_rate": 1e-06, + 
"loss": 1.0158, + "mean_token_accuracy": 0.68636554479599, + "num_tokens": 171333480.0, + "step": 6759 + }, + { + "epoch": 0.7423676696683505, + "grad_norm": 2.0924432277679443, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7049497365951538, + "num_tokens": 171360627.0, + "step": 6760 + }, + { + "epoch": 0.7424774873709642, + "grad_norm": 2.4948530197143555, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.7022609710693359, + "num_tokens": 171381901.0, + "step": 6761 + }, + { + "epoch": 0.7425873050735778, + "grad_norm": 2.0076167583465576, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.6993958353996277, + "num_tokens": 171410600.0, + "step": 6762 + }, + { + "epoch": 0.7426971227761915, + "grad_norm": 2.071349859237671, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.7024694681167603, + "num_tokens": 171437146.0, + "step": 6763 + }, + { + "epoch": 0.7428069404788051, + "grad_norm": 2.2326889038085938, + "learning_rate": 1e-06, + "loss": 1.0293, + "mean_token_accuracy": 0.6844877004623413, + "num_tokens": 171462965.0, + "step": 6764 + }, + { + "epoch": 0.7429167581814189, + "grad_norm": 2.1187188625335693, + "learning_rate": 1e-06, + "loss": 1.0734, + "mean_token_accuracy": 0.678275465965271, + "num_tokens": 171491265.0, + "step": 6765 + }, + { + "epoch": 0.7430265758840325, + "grad_norm": 2.2030234336853027, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7098609209060669, + "num_tokens": 171515561.0, + "step": 6766 + }, + { + "epoch": 0.7431363935866462, + "grad_norm": 2.2726621627807617, + "learning_rate": 1e-06, + "loss": 1.0133, + "mean_token_accuracy": 0.6920316219329834, + "num_tokens": 171543460.0, + "step": 6767 + }, + { + "epoch": 0.7432462112892598, + "grad_norm": 2.1026229858398438, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.6956894397735596, + "num_tokens": 171571098.0, + "step": 6768 + }, + { + "epoch": 
0.7433560289918735, + "grad_norm": 2.275211811065674, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7302625179290771, + "num_tokens": 171594007.0, + "step": 6769 + }, + { + "epoch": 0.7434658466944871, + "grad_norm": 2.2878010272979736, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.6943938136100769, + "num_tokens": 171618746.0, + "step": 6770 + }, + { + "epoch": 0.7435756643971008, + "grad_norm": 2.2830684185028076, + "learning_rate": 1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7251777052879333, + "num_tokens": 171642103.0, + "step": 6771 + }, + { + "epoch": 0.7436854820997145, + "grad_norm": 2.0578360557556152, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.728898286819458, + "num_tokens": 171668147.0, + "step": 6772 + }, + { + "epoch": 0.7437952998023282, + "grad_norm": 2.212669849395752, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7249250411987305, + "num_tokens": 171691590.0, + "step": 6773 + }, + { + "epoch": 0.7439051175049418, + "grad_norm": 2.0036070346832275, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.7100690007209778, + "num_tokens": 171720859.0, + "step": 6774 + }, + { + "epoch": 0.7440149352075555, + "grad_norm": 2.1827573776245117, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7093328833580017, + "num_tokens": 171745589.0, + "step": 6775 + }, + { + "epoch": 0.7441247529101691, + "grad_norm": 2.1876955032348633, + "learning_rate": 1e-06, + "loss": 1.0219, + "mean_token_accuracy": 0.7006354331970215, + "num_tokens": 171770915.0, + "step": 6776 + }, + { + "epoch": 0.7442345706127828, + "grad_norm": 2.294750452041626, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.70965576171875, + "num_tokens": 171793724.0, + "step": 6777 + }, + { + "epoch": 0.7443443883153964, + "grad_norm": 2.4986581802368164, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 
0.7088533639907837, + "num_tokens": 171813914.0, + "step": 6778 + }, + { + "epoch": 0.7444542060180102, + "grad_norm": 2.429206132888794, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7174621820449829, + "num_tokens": 171833740.0, + "step": 6779 + }, + { + "epoch": 0.7445640237206238, + "grad_norm": 2.210744857788086, + "learning_rate": 1e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.7338846325874329, + "num_tokens": 171858173.0, + "step": 6780 + }, + { + "epoch": 0.7446738414232374, + "grad_norm": 2.1857423782348633, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7027300000190735, + "num_tokens": 171884920.0, + "step": 6781 + }, + { + "epoch": 0.7447836591258511, + "grad_norm": 2.336886167526245, + "learning_rate": 1e-06, + "loss": 1.0327, + "mean_token_accuracy": 0.6880970597267151, + "num_tokens": 171907214.0, + "step": 6782 + }, + { + "epoch": 0.7448934768284647, + "grad_norm": 2.656616687774658, + "learning_rate": 1e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7308954000473022, + "num_tokens": 171925724.0, + "step": 6783 + }, + { + "epoch": 0.7450032945310784, + "grad_norm": 2.2270994186401367, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.6936136484146118, + "num_tokens": 171951093.0, + "step": 6784 + }, + { + "epoch": 0.745113112233692, + "grad_norm": 2.140620470046997, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.6980913877487183, + "num_tokens": 171977285.0, + "step": 6785 + }, + { + "epoch": 0.7452229299363057, + "grad_norm": 2.2080464363098145, + "learning_rate": 1e-06, + "loss": 1.0282, + "mean_token_accuracy": 0.6945510506629944, + "num_tokens": 172004411.0, + "step": 6786 + }, + { + "epoch": 0.7453327476389194, + "grad_norm": 2.6358094215393066, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.700937032699585, + "num_tokens": 172023969.0, + "step": 6787 + }, + { + "epoch": 0.7454425653415331, + "grad_norm": 
1.936139464378357, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.6937168836593628, + "num_tokens": 172055483.0, + "step": 6788 + }, + { + "epoch": 0.7455523830441467, + "grad_norm": 2.218303918838501, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7023522853851318, + "num_tokens": 172079563.0, + "step": 6789 + }, + { + "epoch": 0.7456622007467604, + "grad_norm": 2.540072441101074, + "learning_rate": 1e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7220735549926758, + "num_tokens": 172098593.0, + "step": 6790 + }, + { + "epoch": 0.745772018449374, + "grad_norm": 2.559490203857422, + "learning_rate": 1e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7270229458808899, + "num_tokens": 172117665.0, + "step": 6791 + }, + { + "epoch": 0.7458818361519877, + "grad_norm": 2.0871083736419678, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7092962265014648, + "num_tokens": 172145517.0, + "step": 6792 + }, + { + "epoch": 0.7459916538546013, + "grad_norm": 1.8846166133880615, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.6969943046569824, + "num_tokens": 172180095.0, + "step": 6793 + }, + { + "epoch": 0.7461014715572151, + "grad_norm": 2.1668784618377686, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.6913483738899231, + "num_tokens": 172207522.0, + "step": 6794 + }, + { + "epoch": 0.7462112892598287, + "grad_norm": 2.426344156265259, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.6920335292816162, + "num_tokens": 172231526.0, + "step": 6795 + }, + { + "epoch": 0.7463211069624424, + "grad_norm": 2.428463935852051, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7181052565574646, + "num_tokens": 172254567.0, + "step": 6796 + }, + { + "epoch": 0.746430924665056, + "grad_norm": 2.1056244373321533, + "learning_rate": 1e-06, + "loss": 0.7739, + "mean_token_accuracy": 0.7489711046218872, + "num_tokens": 172279376.0, + 
"step": 6797 + }, + { + "epoch": 0.7465407423676697, + "grad_norm": 2.2187089920043945, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7335999011993408, + "num_tokens": 172302721.0, + "step": 6798 + }, + { + "epoch": 0.7466505600702833, + "grad_norm": 2.2024612426757812, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7199759483337402, + "num_tokens": 172330381.0, + "step": 6799 + }, + { + "epoch": 0.7467603777728969, + "grad_norm": 2.236675977706909, + "learning_rate": 1e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.7320104837417603, + "num_tokens": 172354010.0, + "step": 6800 + }, + { + "epoch": 0.7468701954755107, + "grad_norm": 2.34051251411438, + "learning_rate": 1e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.7014005780220032, + "num_tokens": 172376485.0, + "step": 6801 + }, + { + "epoch": 0.7469800131781243, + "grad_norm": 2.4281742572784424, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.7043849229812622, + "num_tokens": 172399993.0, + "step": 6802 + }, + { + "epoch": 0.747089830880738, + "grad_norm": 2.1660547256469727, + "learning_rate": 1e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.7271496653556824, + "num_tokens": 172424135.0, + "step": 6803 + }, + { + "epoch": 0.7471996485833516, + "grad_norm": 2.556755781173706, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7174586057662964, + "num_tokens": 172444620.0, + "step": 6804 + }, + { + "epoch": 0.7473094662859653, + "grad_norm": 2.2512927055358887, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.6972065567970276, + "num_tokens": 172468635.0, + "step": 6805 + }, + { + "epoch": 0.7474192839885789, + "grad_norm": 2.419990062713623, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7108020782470703, + "num_tokens": 172491157.0, + "step": 6806 + }, + { + "epoch": 0.7475291016911926, + "grad_norm": 2.0819482803344727, + "learning_rate": 1e-06, + "loss": 0.9769, + 
"mean_token_accuracy": 0.6997228860855103, + "num_tokens": 172519479.0, + "step": 6807 + }, + { + "epoch": 0.7476389193938063, + "grad_norm": 2.2266149520874023, + "learning_rate": 1e-06, + "loss": 1.0455, + "mean_token_accuracy": 0.6812486052513123, + "num_tokens": 172545543.0, + "step": 6808 + }, + { + "epoch": 0.74774873709642, + "grad_norm": 2.2139158248901367, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7272831201553345, + "num_tokens": 172570338.0, + "step": 6809 + }, + { + "epoch": 0.7478585547990336, + "grad_norm": 2.021282911300659, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7104600071907043, + "num_tokens": 172599860.0, + "step": 6810 + }, + { + "epoch": 0.7479683725016473, + "grad_norm": 2.4600396156311035, + "learning_rate": 1e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.7370235323905945, + "num_tokens": 172619576.0, + "step": 6811 + }, + { + "epoch": 0.7480781902042609, + "grad_norm": 2.3662071228027344, + "learning_rate": 1e-06, + "loss": 0.8778, + "mean_token_accuracy": 0.722440242767334, + "num_tokens": 172641244.0, + "step": 6812 + }, + { + "epoch": 0.7481880079068746, + "grad_norm": 2.255617618560791, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7064353227615356, + "num_tokens": 172668415.0, + "step": 6813 + }, + { + "epoch": 0.7482978256094882, + "grad_norm": 2.1109230518341064, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7178915143013, + "num_tokens": 172694676.0, + "step": 6814 + }, + { + "epoch": 0.7484076433121019, + "grad_norm": 2.0424764156341553, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7097126841545105, + "num_tokens": 172722597.0, + "step": 6815 + }, + { + "epoch": 0.7485174610147156, + "grad_norm": 2.4184329509735107, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.702837347984314, + "num_tokens": 172744500.0, + "step": 6816 + }, + { + "epoch": 0.7486272787173293, + 
"grad_norm": 2.408214569091797, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.6945475935935974, + "num_tokens": 172766404.0, + "step": 6817 + }, + { + "epoch": 0.7487370964199429, + "grad_norm": 1.9777765274047852, + "learning_rate": 1e-06, + "loss": 1.0318, + "mean_token_accuracy": 0.6943869590759277, + "num_tokens": 172795338.0, + "step": 6818 + }, + { + "epoch": 0.7488469141225566, + "grad_norm": 2.2679965496063232, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7184737920761108, + "num_tokens": 172817022.0, + "step": 6819 + }, + { + "epoch": 0.7489567318251702, + "grad_norm": 2.4048995971679688, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7058835625648499, + "num_tokens": 172839055.0, + "step": 6820 + }, + { + "epoch": 0.7490665495277838, + "grad_norm": 2.5003597736358643, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7029983401298523, + "num_tokens": 172860286.0, + "step": 6821 + }, + { + "epoch": 0.7491763672303975, + "grad_norm": 2.170741081237793, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.690671443939209, + "num_tokens": 172890237.0, + "step": 6822 + }, + { + "epoch": 0.7492861849330112, + "grad_norm": 2.3522064685821533, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.6942024230957031, + "num_tokens": 172913497.0, + "step": 6823 + }, + { + "epoch": 0.7493960026356249, + "grad_norm": 2.0568885803222656, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7167010307312012, + "num_tokens": 172940723.0, + "step": 6824 + }, + { + "epoch": 0.7495058203382385, + "grad_norm": 2.363967180252075, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7175012826919556, + "num_tokens": 172964331.0, + "step": 6825 + }, + { + "epoch": 0.7496156380408522, + "grad_norm": 2.2008473873138428, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7059308290481567, + 
"num_tokens": 172991347.0, + "step": 6826 + }, + { + "epoch": 0.7497254557434658, + "grad_norm": 2.2182579040527344, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7099463939666748, + "num_tokens": 173017745.0, + "step": 6827 + }, + { + "epoch": 0.7498352734460795, + "grad_norm": 2.158498525619507, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.702451229095459, + "num_tokens": 173044184.0, + "step": 6828 + }, + { + "epoch": 0.7499450911486931, + "grad_norm": 2.0272393226623535, + "learning_rate": 1e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.7182096242904663, + "num_tokens": 173071779.0, + "step": 6829 + }, + { + "epoch": 0.7500549088513069, + "grad_norm": 2.050976514816284, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7257393598556519, + "num_tokens": 173098714.0, + "step": 6830 + }, + { + "epoch": 0.7501647265539205, + "grad_norm": 2.3498921394348145, + "learning_rate": 1e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.6816742420196533, + "num_tokens": 173121646.0, + "step": 6831 + }, + { + "epoch": 0.7502745442565342, + "grad_norm": 1.988250494003296, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7086181640625, + "num_tokens": 173149550.0, + "step": 6832 + }, + { + "epoch": 0.7503843619591478, + "grad_norm": 2.310767412185669, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7098502516746521, + "num_tokens": 173172442.0, + "step": 6833 + }, + { + "epoch": 0.7504941796617615, + "grad_norm": 2.232295036315918, + "learning_rate": 1e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.7348494529724121, + "num_tokens": 173196047.0, + "step": 6834 + }, + { + "epoch": 0.7506039973643751, + "grad_norm": 2.342092514038086, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7071981430053711, + "num_tokens": 173218124.0, + "step": 6835 + }, + { + "epoch": 0.7507138150669888, + "grad_norm": 2.1841979026794434, + "learning_rate": 
1e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.6915477514266968, + "num_tokens": 173246764.0, + "step": 6836 + }, + { + "epoch": 0.7508236327696025, + "grad_norm": 2.2574262619018555, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.6938159465789795, + "num_tokens": 173271980.0, + "step": 6837 + }, + { + "epoch": 0.7509334504722162, + "grad_norm": 2.0990123748779297, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7198816537857056, + "num_tokens": 173300168.0, + "step": 6838 + }, + { + "epoch": 0.7510432681748298, + "grad_norm": 1.973642110824585, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7101374864578247, + "num_tokens": 173332619.0, + "step": 6839 + }, + { + "epoch": 0.7511530858774434, + "grad_norm": 2.1301660537719727, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7106094360351562, + "num_tokens": 173357014.0, + "step": 6840 + }, + { + "epoch": 0.7512629035800571, + "grad_norm": 2.2960660457611084, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7124453186988831, + "num_tokens": 173379451.0, + "step": 6841 + }, + { + "epoch": 0.7513727212826707, + "grad_norm": 2.314513921737671, + "learning_rate": 1e-06, + "loss": 0.8709, + "mean_token_accuracy": 0.7290195226669312, + "num_tokens": 173402093.0, + "step": 6842 + }, + { + "epoch": 0.7514825389852844, + "grad_norm": 1.8428987264633179, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.706237256526947, + "num_tokens": 173436075.0, + "step": 6843 + }, + { + "epoch": 0.7515923566878981, + "grad_norm": 2.3400444984436035, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.7090175151824951, + "num_tokens": 173459762.0, + "step": 6844 + }, + { + "epoch": 0.7517021743905118, + "grad_norm": 2.1579577922821045, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.708519458770752, + "num_tokens": 173486237.0, + "step": 6845 + }, + { + "epoch": 
0.7518119920931254, + "grad_norm": 1.8330496549606323, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7135065793991089, + "num_tokens": 173518814.0, + "step": 6846 + }, + { + "epoch": 0.7519218097957391, + "grad_norm": 2.05104660987854, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.6945561766624451, + "num_tokens": 173547838.0, + "step": 6847 + }, + { + "epoch": 0.7520316274983527, + "grad_norm": 2.076986074447632, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.7028590440750122, + "num_tokens": 173575606.0, + "step": 6848 + }, + { + "epoch": 0.7521414452009664, + "grad_norm": 1.9139034748077393, + "learning_rate": 1e-06, + "loss": 1.0633, + "mean_token_accuracy": 0.683485746383667, + "num_tokens": 173607614.0, + "step": 6849 + }, + { + "epoch": 0.75225126290358, + "grad_norm": 2.007582664489746, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7025022506713867, + "num_tokens": 173636252.0, + "step": 6850 + }, + { + "epoch": 0.7523610806061937, + "grad_norm": 2.0195047855377197, + "learning_rate": 1e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.684494137763977, + "num_tokens": 173665625.0, + "step": 6851 + }, + { + "epoch": 0.7524708983088074, + "grad_norm": 2.2024500370025635, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.6903069019317627, + "num_tokens": 173691783.0, + "step": 6852 + }, + { + "epoch": 0.7525807160114211, + "grad_norm": 2.3506104946136475, + "learning_rate": 1e-06, + "loss": 1.0444, + "mean_token_accuracy": 0.7106529474258423, + "num_tokens": 173716652.0, + "step": 6853 + }, + { + "epoch": 0.7526905337140347, + "grad_norm": 2.1213274002075195, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.7028261423110962, + "num_tokens": 173742151.0, + "step": 6854 + }, + { + "epoch": 0.7528003514166484, + "grad_norm": 2.05706524848938, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 
0.7238057851791382, + "num_tokens": 173768959.0, + "step": 6855 + }, + { + "epoch": 0.752910169119262, + "grad_norm": 2.104743719100952, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7101596593856812, + "num_tokens": 173797912.0, + "step": 6856 + }, + { + "epoch": 0.7530199868218757, + "grad_norm": 2.226608991622925, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7011011838912964, + "num_tokens": 173820973.0, + "step": 6857 + }, + { + "epoch": 0.7531298045244893, + "grad_norm": 2.1496479511260986, + "learning_rate": 1e-06, + "loss": 1.0529, + "mean_token_accuracy": 0.6776223182678223, + "num_tokens": 173848116.0, + "step": 6858 + }, + { + "epoch": 0.7532396222271031, + "grad_norm": 1.923933744430542, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7078979015350342, + "num_tokens": 173878250.0, + "step": 6859 + }, + { + "epoch": 0.7533494399297167, + "grad_norm": 2.2907605171203613, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7100704908370972, + "num_tokens": 173901124.0, + "step": 6860 + }, + { + "epoch": 0.7534592576323303, + "grad_norm": 2.3157105445861816, + "learning_rate": 1e-06, + "loss": 1.0334, + "mean_token_accuracy": 0.692989706993103, + "num_tokens": 173926364.0, + "step": 6861 + }, + { + "epoch": 0.753569075334944, + "grad_norm": 2.172027826309204, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7110331058502197, + "num_tokens": 173954287.0, + "step": 6862 + }, + { + "epoch": 0.7536788930375576, + "grad_norm": 1.9129494428634644, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7226369976997375, + "num_tokens": 173985428.0, + "step": 6863 + }, + { + "epoch": 0.7537887107401713, + "grad_norm": 2.2004952430725098, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.6948319673538208, + "num_tokens": 174010630.0, + "step": 6864 + }, + { + "epoch": 0.7538985284427849, + "grad_norm": 
2.057339668273926, + "learning_rate": 1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.7092453837394714, + "num_tokens": 174037504.0, + "step": 6865 + }, + { + "epoch": 0.7540083461453987, + "grad_norm": 2.096790313720703, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7190715074539185, + "num_tokens": 174064789.0, + "step": 6866 + }, + { + "epoch": 0.7541181638480123, + "grad_norm": 2.513540506362915, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7048360109329224, + "num_tokens": 174085562.0, + "step": 6867 + }, + { + "epoch": 0.754227981550626, + "grad_norm": 2.147362470626831, + "learning_rate": 1e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7263545989990234, + "num_tokens": 174110149.0, + "step": 6868 + }, + { + "epoch": 0.7543377992532396, + "grad_norm": 2.0302367210388184, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7042748928070068, + "num_tokens": 174139950.0, + "step": 6869 + }, + { + "epoch": 0.7544476169558533, + "grad_norm": 1.9637882709503174, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7051783800125122, + "num_tokens": 174168972.0, + "step": 6870 + }, + { + "epoch": 0.7545574346584669, + "grad_norm": 2.538562059402466, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7221338152885437, + "num_tokens": 174188117.0, + "step": 6871 + }, + { + "epoch": 0.7546672523610806, + "grad_norm": 2.2693395614624023, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7114068269729614, + "num_tokens": 174214089.0, + "step": 6872 + }, + { + "epoch": 0.7547770700636943, + "grad_norm": 2.1821908950805664, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7076665759086609, + "num_tokens": 174239856.0, + "step": 6873 + }, + { + "epoch": 0.754886887766308, + "grad_norm": 1.9894622564315796, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7166120409965515, + "num_tokens": 
174268089.0, + "step": 6874 + }, + { + "epoch": 0.7549967054689216, + "grad_norm": 2.2634811401367188, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.7038841247558594, + "num_tokens": 174291645.0, + "step": 6875 + }, + { + "epoch": 0.7551065231715353, + "grad_norm": 2.2970666885375977, + "learning_rate": 1e-06, + "loss": 1.0219, + "mean_token_accuracy": 0.6986875534057617, + "num_tokens": 174317327.0, + "step": 6876 + }, + { + "epoch": 0.7552163408741489, + "grad_norm": 2.450582265853882, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7013553380966187, + "num_tokens": 174337989.0, + "step": 6877 + }, + { + "epoch": 0.7553261585767626, + "grad_norm": 2.320911169052124, + "learning_rate": 1e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.6857826709747314, + "num_tokens": 174362858.0, + "step": 6878 + }, + { + "epoch": 0.7554359762793762, + "grad_norm": 1.9712549448013306, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7215973138809204, + "num_tokens": 174393974.0, + "step": 6879 + }, + { + "epoch": 0.7555457939819898, + "grad_norm": 2.766968011856079, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7203612327575684, + "num_tokens": 174410063.0, + "step": 6880 + }, + { + "epoch": 0.7556556116846036, + "grad_norm": 2.1504440307617188, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7028652429580688, + "num_tokens": 174437781.0, + "step": 6881 + }, + { + "epoch": 0.7557654293872172, + "grad_norm": 2.0237879753112793, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7165126800537109, + "num_tokens": 174465090.0, + "step": 6882 + }, + { + "epoch": 0.7558752470898309, + "grad_norm": 2.2563576698303223, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.7037219405174255, + "num_tokens": 174491381.0, + "step": 6883 + }, + { + "epoch": 0.7559850647924445, + "grad_norm": 2.1236777305603027, + "learning_rate": 1e-06, + 
"loss": 0.9649, + "mean_token_accuracy": 0.7025207281112671, + "num_tokens": 174518107.0, + "step": 6884 + }, + { + "epoch": 0.7560948824950582, + "grad_norm": 2.456650733947754, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7172905206680298, + "num_tokens": 174538245.0, + "step": 6885 + }, + { + "epoch": 0.7562047001976718, + "grad_norm": 1.8969316482543945, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.695451021194458, + "num_tokens": 174570733.0, + "step": 6886 + }, + { + "epoch": 0.7563145179002855, + "grad_norm": 2.20383358001709, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7211587429046631, + "num_tokens": 174594223.0, + "step": 6887 + }, + { + "epoch": 0.7564243356028992, + "grad_norm": 2.0129342079162598, + "learning_rate": 1e-06, + "loss": 1.033, + "mean_token_accuracy": 0.6982444524765015, + "num_tokens": 174624020.0, + "step": 6888 + }, + { + "epoch": 0.7565341533055129, + "grad_norm": 2.0473430156707764, + "learning_rate": 1e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.6923545002937317, + "num_tokens": 174652200.0, + "step": 6889 + }, + { + "epoch": 0.7566439710081265, + "grad_norm": 2.3234779834747314, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7077934741973877, + "num_tokens": 174676632.0, + "step": 6890 + }, + { + "epoch": 0.7567537887107402, + "grad_norm": 1.9537458419799805, + "learning_rate": 1e-06, + "loss": 0.8573, + "mean_token_accuracy": 0.7290925979614258, + "num_tokens": 174705179.0, + "step": 6891 + }, + { + "epoch": 0.7568636064133538, + "grad_norm": 2.442307949066162, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7059169411659241, + "num_tokens": 174725002.0, + "step": 6892 + }, + { + "epoch": 0.7569734241159675, + "grad_norm": 2.372844696044922, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.7021788954734802, + "num_tokens": 174747703.0, + "step": 6893 + }, + { + "epoch": 
0.7570832418185811, + "grad_norm": 2.52388334274292, + "learning_rate": 1e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7201954126358032, + "num_tokens": 174766479.0, + "step": 6894 + }, + { + "epoch": 0.7571930595211949, + "grad_norm": 2.1462149620056152, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7205308675765991, + "num_tokens": 174790113.0, + "step": 6895 + }, + { + "epoch": 0.7573028772238085, + "grad_norm": 2.3901965618133545, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7211441993713379, + "num_tokens": 174811312.0, + "step": 6896 + }, + { + "epoch": 0.7574126949264222, + "grad_norm": 2.1916701793670654, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.6987271904945374, + "num_tokens": 174835480.0, + "step": 6897 + }, + { + "epoch": 0.7575225126290358, + "grad_norm": 2.3592288494110107, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7139922380447388, + "num_tokens": 174856452.0, + "step": 6898 + }, + { + "epoch": 0.7576323303316495, + "grad_norm": 1.9255313873291016, + "learning_rate": 1e-06, + "loss": 1.0388, + "mean_token_accuracy": 0.6864356994628906, + "num_tokens": 174888560.0, + "step": 6899 + }, + { + "epoch": 0.7577421480342631, + "grad_norm": 2.0848560333251953, + "learning_rate": 1e-06, + "loss": 1.02, + "mean_token_accuracy": 0.6979547739028931, + "num_tokens": 174915946.0, + "step": 6900 + }, + { + "epoch": 0.7578519657368767, + "grad_norm": 1.942552089691162, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.709298849105835, + "num_tokens": 174948950.0, + "step": 6901 + }, + { + "epoch": 0.7579617834394905, + "grad_norm": 2.6294703483581543, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7033990621566772, + "num_tokens": 174966646.0, + "step": 6902 + }, + { + "epoch": 0.7580716011421041, + "grad_norm": 2.077059745788574, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 
0.7124784588813782, + "num_tokens": 174992819.0, + "step": 6903 + }, + { + "epoch": 0.7581814188447178, + "grad_norm": 2.0999698638916016, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7090579271316528, + "num_tokens": 175019937.0, + "step": 6904 + }, + { + "epoch": 0.7582912365473314, + "grad_norm": 1.9954184293746948, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7174319624900818, + "num_tokens": 175048514.0, + "step": 6905 + }, + { + "epoch": 0.7584010542499451, + "grad_norm": 2.1780152320861816, + "learning_rate": 1e-06, + "loss": 1.0965, + "mean_token_accuracy": 0.6726866960525513, + "num_tokens": 175075284.0, + "step": 6906 + }, + { + "epoch": 0.7585108719525587, + "grad_norm": 2.3900911808013916, + "learning_rate": 1e-06, + "loss": 0.8387, + "mean_token_accuracy": 0.7360528707504272, + "num_tokens": 175096520.0, + "step": 6907 + }, + { + "epoch": 0.7586206896551724, + "grad_norm": 2.1225147247314453, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.6951375007629395, + "num_tokens": 175124236.0, + "step": 6908 + }, + { + "epoch": 0.758730507357786, + "grad_norm": 2.1720993518829346, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7110759615898132, + "num_tokens": 175148848.0, + "step": 6909 + }, + { + "epoch": 0.7588403250603998, + "grad_norm": 2.3613665103912354, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7249881029129028, + "num_tokens": 175169601.0, + "step": 6910 + }, + { + "epoch": 0.7589501427630134, + "grad_norm": 1.9877982139587402, + "learning_rate": 1e-06, + "loss": 0.8282, + "mean_token_accuracy": 0.7383125424385071, + "num_tokens": 175196178.0, + "step": 6911 + }, + { + "epoch": 0.7590599604656271, + "grad_norm": 2.0534324645996094, + "learning_rate": 1e-06, + "loss": 1.1179, + "mean_token_accuracy": 0.6644237041473389, + "num_tokens": 175228103.0, + "step": 6912 + }, + { + "epoch": 0.7591697781682407, + "grad_norm": 
2.5523881912231445, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7110457420349121, + "num_tokens": 175248321.0, + "step": 6913 + }, + { + "epoch": 0.7592795958708544, + "grad_norm": 2.1147541999816895, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7084836959838867, + "num_tokens": 175275218.0, + "step": 6914 + }, + { + "epoch": 0.759389413573468, + "grad_norm": 2.200934648513794, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7049455642700195, + "num_tokens": 175300803.0, + "step": 6915 + }, + { + "epoch": 0.7594992312760817, + "grad_norm": 2.3280489444732666, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.6968321800231934, + "num_tokens": 175324043.0, + "step": 6916 + }, + { + "epoch": 0.7596090489786954, + "grad_norm": 1.9446443319320679, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7015293836593628, + "num_tokens": 175355040.0, + "step": 6917 + }, + { + "epoch": 0.7597188666813091, + "grad_norm": 2.1662240028381348, + "learning_rate": 1e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.6900221109390259, + "num_tokens": 175381462.0, + "step": 6918 + }, + { + "epoch": 0.7598286843839227, + "grad_norm": 2.1805737018585205, + "learning_rate": 1e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7313316464424133, + "num_tokens": 175406668.0, + "step": 6919 + }, + { + "epoch": 0.7599385020865363, + "grad_norm": 2.061837911605835, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7017109990119934, + "num_tokens": 175434084.0, + "step": 6920 + }, + { + "epoch": 0.76004831978915, + "grad_norm": 1.9607110023498535, + "learning_rate": 1e-06, + "loss": 1.0974, + "mean_token_accuracy": 0.6723641753196716, + "num_tokens": 175466986.0, + "step": 6921 + }, + { + "epoch": 0.7601581374917636, + "grad_norm": 2.2463200092315674, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7324570417404175, + "num_tokens": 
175491739.0, + "step": 6922 + }, + { + "epoch": 0.7602679551943773, + "grad_norm": 2.2509329319000244, + "learning_rate": 1e-06, + "loss": 0.8586, + "mean_token_accuracy": 0.7360714077949524, + "num_tokens": 175514597.0, + "step": 6923 + }, + { + "epoch": 0.760377772896991, + "grad_norm": 2.2641146183013916, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.6966395378112793, + "num_tokens": 175542002.0, + "step": 6924 + }, + { + "epoch": 0.7604875905996047, + "grad_norm": 2.6679513454437256, + "learning_rate": 1e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.7186404466629028, + "num_tokens": 175561238.0, + "step": 6925 + }, + { + "epoch": 0.7605974083022183, + "grad_norm": 2.446821928024292, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7253532409667969, + "num_tokens": 175581639.0, + "step": 6926 + }, + { + "epoch": 0.760707226004832, + "grad_norm": 2.130572557449341, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7069685459136963, + "num_tokens": 175607081.0, + "step": 6927 + }, + { + "epoch": 0.7608170437074456, + "grad_norm": 1.9056466817855835, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.6987269520759583, + "num_tokens": 175641209.0, + "step": 6928 + }, + { + "epoch": 0.7609268614100593, + "grad_norm": 2.574702501296997, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7263843417167664, + "num_tokens": 175659466.0, + "step": 6929 + }, + { + "epoch": 0.7610366791126729, + "grad_norm": 2.6212782859802246, + "learning_rate": 1e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.6902968883514404, + "num_tokens": 175681039.0, + "step": 6930 + }, + { + "epoch": 0.7611464968152867, + "grad_norm": 2.3442776203155518, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7082793712615967, + "num_tokens": 175704555.0, + "step": 6931 + }, + { + "epoch": 0.7612563145179003, + "grad_norm": 2.1609702110290527, + "learning_rate": 1e-06, + 
"loss": 1.0211, + "mean_token_accuracy": 0.6936355829238892, + "num_tokens": 175730861.0, + "step": 6932 + }, + { + "epoch": 0.761366132220514, + "grad_norm": 2.123891830444336, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.709257185459137, + "num_tokens": 175755867.0, + "step": 6933 + }, + { + "epoch": 0.7614759499231276, + "grad_norm": 2.201421022415161, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.723946750164032, + "num_tokens": 175779054.0, + "step": 6934 + }, + { + "epoch": 0.7615857676257413, + "grad_norm": 2.2351133823394775, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7083526253700256, + "num_tokens": 175803217.0, + "step": 6935 + }, + { + "epoch": 0.7616955853283549, + "grad_norm": 1.8093843460083008, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.6956260800361633, + "num_tokens": 175839030.0, + "step": 6936 + }, + { + "epoch": 0.7618054030309686, + "grad_norm": 2.470543622970581, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7144776582717896, + "num_tokens": 175859535.0, + "step": 6937 + }, + { + "epoch": 0.7619152207335822, + "grad_norm": 2.328489303588867, + "learning_rate": 1e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.732899010181427, + "num_tokens": 175881144.0, + "step": 6938 + }, + { + "epoch": 0.762025038436196, + "grad_norm": 2.0263681411743164, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7277543544769287, + "num_tokens": 175909088.0, + "step": 6939 + }, + { + "epoch": 0.7621348561388096, + "grad_norm": 2.2001123428344727, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7057466506958008, + "num_tokens": 175935067.0, + "step": 6940 + }, + { + "epoch": 0.7622446738414232, + "grad_norm": 2.293269395828247, + "learning_rate": 1e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.725935697555542, + "num_tokens": 175956837.0, + "step": 6941 + }, + { + "epoch": 
0.7623544915440369, + "grad_norm": 2.095082998275757, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.735572099685669, + "num_tokens": 175982832.0, + "step": 6942 + }, + { + "epoch": 0.7624643092466505, + "grad_norm": 2.0961532592773438, + "learning_rate": 1e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7428233623504639, + "num_tokens": 176008393.0, + "step": 6943 + }, + { + "epoch": 0.7625741269492642, + "grad_norm": 2.273230791091919, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.6928793787956238, + "num_tokens": 176032667.0, + "step": 6944 + }, + { + "epoch": 0.7626839446518778, + "grad_norm": 2.0867574214935303, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.712207555770874, + "num_tokens": 176058966.0, + "step": 6945 + }, + { + "epoch": 0.7627937623544916, + "grad_norm": 2.140237331390381, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7118627429008484, + "num_tokens": 176087240.0, + "step": 6946 + }, + { + "epoch": 0.7629035800571052, + "grad_norm": 2.621361255645752, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.712051510810852, + "num_tokens": 176105883.0, + "step": 6947 + }, + { + "epoch": 0.7630133977597189, + "grad_norm": 2.146436929702759, + "learning_rate": 1e-06, + "loss": 1.094, + "mean_token_accuracy": 0.6773268580436707, + "num_tokens": 176134470.0, + "step": 6948 + }, + { + "epoch": 0.7631232154623325, + "grad_norm": 2.2241744995117188, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7239323258399963, + "num_tokens": 176157710.0, + "step": 6949 + }, + { + "epoch": 0.7632330331649462, + "grad_norm": 2.2983617782592773, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.712921142578125, + "num_tokens": 176180354.0, + "step": 6950 + }, + { + "epoch": 0.7633428508675598, + "grad_norm": 2.0357325077056885, + "learning_rate": 1e-06, + "loss": 1.0362, + "mean_token_accuracy": 
0.6895549297332764, + "num_tokens": 176211709.0, + "step": 6951 + }, + { + "epoch": 0.7634526685701735, + "grad_norm": 2.1240928173065186, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.6937991380691528, + "num_tokens": 176239024.0, + "step": 6952 + }, + { + "epoch": 0.7635624862727872, + "grad_norm": 2.3962461948394775, + "learning_rate": 1e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7271558046340942, + "num_tokens": 176260790.0, + "step": 6953 + }, + { + "epoch": 0.7636723039754009, + "grad_norm": 2.2134850025177, + "learning_rate": 1e-06, + "loss": 1.0226, + "mean_token_accuracy": 0.6885973215103149, + "num_tokens": 176286504.0, + "step": 6954 + }, + { + "epoch": 0.7637821216780145, + "grad_norm": 2.0399975776672363, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.736842930316925, + "num_tokens": 176311821.0, + "step": 6955 + }, + { + "epoch": 0.7638919393806282, + "grad_norm": 2.0219457149505615, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.6976678371429443, + "num_tokens": 176339900.0, + "step": 6956 + }, + { + "epoch": 0.7640017570832418, + "grad_norm": 2.3822762966156006, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.702008843421936, + "num_tokens": 176361020.0, + "step": 6957 + }, + { + "epoch": 0.7641115747858555, + "grad_norm": 2.0862393379211426, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.6924030184745789, + "num_tokens": 176388785.0, + "step": 6958 + }, + { + "epoch": 0.7642213924884691, + "grad_norm": 2.1535778045654297, + "learning_rate": 1e-06, + "loss": 0.877, + "mean_token_accuracy": 0.7252194881439209, + "num_tokens": 176412779.0, + "step": 6959 + }, + { + "epoch": 0.7643312101910829, + "grad_norm": 2.4625966548919678, + "learning_rate": 1e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.7225680351257324, + "num_tokens": 176432209.0, + "step": 6960 + }, + { + "epoch": 0.7644410278936965, + "grad_norm": 
2.7462422847747803, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7139236927032471, + "num_tokens": 176450223.0, + "step": 6961 + }, + { + "epoch": 0.7645508455963101, + "grad_norm": 2.131519079208374, + "learning_rate": 1e-06, + "loss": 1.0162, + "mean_token_accuracy": 0.6925095915794373, + "num_tokens": 176476167.0, + "step": 6962 + }, + { + "epoch": 0.7646606632989238, + "grad_norm": 1.956944227218628, + "learning_rate": 1e-06, + "loss": 1.0243, + "mean_token_accuracy": 0.7030231952667236, + "num_tokens": 176507025.0, + "step": 6963 + }, + { + "epoch": 0.7647704810015374, + "grad_norm": 2.2776143550872803, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7140026092529297, + "num_tokens": 176530717.0, + "step": 6964 + }, + { + "epoch": 0.7648802987041511, + "grad_norm": 2.244375228881836, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7149889469146729, + "num_tokens": 176554762.0, + "step": 6965 + }, + { + "epoch": 0.7649901164067647, + "grad_norm": 2.1500232219696045, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7054626941680908, + "num_tokens": 176584402.0, + "step": 6966 + }, + { + "epoch": 0.7650999341093784, + "grad_norm": 2.4432945251464844, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7291926741600037, + "num_tokens": 176605705.0, + "step": 6967 + }, + { + "epoch": 0.7652097518119921, + "grad_norm": 2.216975212097168, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.6983641386032104, + "num_tokens": 176631673.0, + "step": 6968 + }, + { + "epoch": 0.7653195695146058, + "grad_norm": 2.0904955863952637, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7206270694732666, + "num_tokens": 176659046.0, + "step": 6969 + }, + { + "epoch": 0.7654293872172194, + "grad_norm": 1.9336601495742798, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.6873597502708435, + "num_tokens": 
176689071.0, + "step": 6970 + }, + { + "epoch": 0.7655392049198331, + "grad_norm": 2.2621042728424072, + "learning_rate": 1e-06, + "loss": 0.987, + "mean_token_accuracy": 0.6972965002059937, + "num_tokens": 176712730.0, + "step": 6971 + }, + { + "epoch": 0.7656490226224467, + "grad_norm": 2.3884530067443848, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.71340012550354, + "num_tokens": 176734288.0, + "step": 6972 + }, + { + "epoch": 0.7657588403250604, + "grad_norm": 2.285696029663086, + "learning_rate": 1e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7308667898178101, + "num_tokens": 176756230.0, + "step": 6973 + }, + { + "epoch": 0.765868658027674, + "grad_norm": 2.300354480743408, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7038969993591309, + "num_tokens": 176779100.0, + "step": 6974 + }, + { + "epoch": 0.7659784757302878, + "grad_norm": 1.9877811670303345, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7111458778381348, + "num_tokens": 176810174.0, + "step": 6975 + }, + { + "epoch": 0.7660882934329014, + "grad_norm": 2.224825143814087, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.6954482793807983, + "num_tokens": 176834618.0, + "step": 6976 + }, + { + "epoch": 0.7661981111355151, + "grad_norm": 2.232731819152832, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.704841136932373, + "num_tokens": 176858121.0, + "step": 6977 + }, + { + "epoch": 0.7663079288381287, + "grad_norm": 2.456176996231079, + "learning_rate": 1e-06, + "loss": 0.856, + "mean_token_accuracy": 0.7237248420715332, + "num_tokens": 176876783.0, + "step": 6978 + }, + { + "epoch": 0.7664177465407424, + "grad_norm": 2.4929654598236084, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.6984720230102539, + "num_tokens": 176897869.0, + "step": 6979 + }, + { + "epoch": 0.766527564243356, + "grad_norm": 2.0463626384735107, + "learning_rate": 1e-06, + "loss": 
1.0389, + "mean_token_accuracy": 0.6875603199005127, + "num_tokens": 176926755.0, + "step": 6980 + }, + { + "epoch": 0.7666373819459696, + "grad_norm": 2.2074124813079834, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7173538208007812, + "num_tokens": 176951209.0, + "step": 6981 + }, + { + "epoch": 0.7667471996485834, + "grad_norm": 2.1228184700012207, + "learning_rate": 1e-06, + "loss": 1.0654, + "mean_token_accuracy": 0.6758534908294678, + "num_tokens": 176979344.0, + "step": 6982 + }, + { + "epoch": 0.766857017351197, + "grad_norm": 2.6311299800872803, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7177896499633789, + "num_tokens": 176998436.0, + "step": 6983 + }, + { + "epoch": 0.7669668350538107, + "grad_norm": 2.0207786560058594, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7055385708808899, + "num_tokens": 177028527.0, + "step": 6984 + }, + { + "epoch": 0.7670766527564243, + "grad_norm": 2.0807578563690186, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7305857539176941, + "num_tokens": 177054660.0, + "step": 6985 + }, + { + "epoch": 0.767186470459038, + "grad_norm": 2.346379280090332, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7138919234275818, + "num_tokens": 177076086.0, + "step": 6986 + }, + { + "epoch": 0.7672962881616516, + "grad_norm": 2.326975107192993, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7124409675598145, + "num_tokens": 177098449.0, + "step": 6987 + }, + { + "epoch": 0.7674061058642653, + "grad_norm": 1.9335688352584839, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.6950228214263916, + "num_tokens": 177130955.0, + "step": 6988 + }, + { + "epoch": 0.767515923566879, + "grad_norm": 2.3019962310791016, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.703821063041687, + "num_tokens": 177153981.0, + "step": 6989 + }, + { + "epoch": 
0.7676257412694927, + "grad_norm": 2.0589420795440674, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.7023196816444397, + "num_tokens": 177185190.0, + "step": 6990 + }, + { + "epoch": 0.7677355589721063, + "grad_norm": 2.2458488941192627, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7004677057266235, + "num_tokens": 177209576.0, + "step": 6991 + }, + { + "epoch": 0.76784537667472, + "grad_norm": 1.9773669242858887, + "learning_rate": 1e-06, + "loss": 1.0554, + "mean_token_accuracy": 0.6821579933166504, + "num_tokens": 177243431.0, + "step": 6992 + }, + { + "epoch": 0.7679551943773336, + "grad_norm": 2.1164681911468506, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.714322030544281, + "num_tokens": 177271420.0, + "step": 6993 + }, + { + "epoch": 0.7680650120799473, + "grad_norm": 2.234962224960327, + "learning_rate": 1e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7324534058570862, + "num_tokens": 177293902.0, + "step": 6994 + }, + { + "epoch": 0.7681748297825609, + "grad_norm": 2.0998711585998535, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7063481211662292, + "num_tokens": 177320841.0, + "step": 6995 + }, + { + "epoch": 0.7682846474851747, + "grad_norm": 2.018852710723877, + "learning_rate": 1e-06, + "loss": 1.016, + "mean_token_accuracy": 0.6907063126564026, + "num_tokens": 177350606.0, + "step": 6996 + }, + { + "epoch": 0.7683944651877883, + "grad_norm": 2.2824220657348633, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.6971600651741028, + "num_tokens": 177374668.0, + "step": 6997 + }, + { + "epoch": 0.768504282890402, + "grad_norm": 2.207401752471924, + "learning_rate": 1e-06, + "loss": 1.0157, + "mean_token_accuracy": 0.6883106827735901, + "num_tokens": 177399816.0, + "step": 6998 + }, + { + "epoch": 0.7686141005930156, + "grad_norm": 2.1324329376220703, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 
0.7132366299629211, + "num_tokens": 177427922.0, + "step": 6999 + }, + { + "epoch": 0.7687239182956292, + "grad_norm": 2.5301156044006348, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7277094721794128, + "num_tokens": 177449270.0, + "step": 7000 + }, + { + "epoch": 0.7688337359982429, + "grad_norm": 2.3074839115142822, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7140867710113525, + "num_tokens": 177470490.0, + "step": 7001 + }, + { + "epoch": 0.7689435537008565, + "grad_norm": 2.1704399585723877, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7041856050491333, + "num_tokens": 177496858.0, + "step": 7002 + }, + { + "epoch": 0.7690533714034702, + "grad_norm": 2.0493078231811523, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.7053631544113159, + "num_tokens": 177523539.0, + "step": 7003 + }, + { + "epoch": 0.7691631891060839, + "grad_norm": 2.1065006256103516, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7294820547103882, + "num_tokens": 177550682.0, + "step": 7004 + }, + { + "epoch": 0.7692730068086976, + "grad_norm": 2.334697961807251, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7130318880081177, + "num_tokens": 177571958.0, + "step": 7005 + }, + { + "epoch": 0.7693828245113112, + "grad_norm": 2.4353506565093994, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7160974740982056, + "num_tokens": 177592218.0, + "step": 7006 + }, + { + "epoch": 0.7694926422139249, + "grad_norm": 2.0450844764709473, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7220932245254517, + "num_tokens": 177619098.0, + "step": 7007 + }, + { + "epoch": 0.7696024599165385, + "grad_norm": 2.2449498176574707, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7011269330978394, + "num_tokens": 177644295.0, + "step": 7008 + }, + { + "epoch": 0.7697122776191522, + "grad_norm": 
2.242428779602051, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7321180105209351, + "num_tokens": 177667893.0, + "step": 7009 + }, + { + "epoch": 0.7698220953217658, + "grad_norm": 1.9516003131866455, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.695004940032959, + "num_tokens": 177698632.0, + "step": 7010 + }, + { + "epoch": 0.7699319130243796, + "grad_norm": 2.194004774093628, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.7042774558067322, + "num_tokens": 177723175.0, + "step": 7011 + }, + { + "epoch": 0.7700417307269932, + "grad_norm": 2.2360153198242188, + "learning_rate": 1e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.7018857598304749, + "num_tokens": 177748573.0, + "step": 7012 + }, + { + "epoch": 0.7701515484296069, + "grad_norm": 2.4562301635742188, + "learning_rate": 1e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7283910512924194, + "num_tokens": 177768104.0, + "step": 7013 + }, + { + "epoch": 0.7702613661322205, + "grad_norm": 2.522369623184204, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7191704511642456, + "num_tokens": 177787812.0, + "step": 7014 + }, + { + "epoch": 0.7703711838348342, + "grad_norm": 2.0435705184936523, + "learning_rate": 1e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.6933314800262451, + "num_tokens": 177815235.0, + "step": 7015 + }, + { + "epoch": 0.7704810015374478, + "grad_norm": 2.2479352951049805, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7119839191436768, + "num_tokens": 177841232.0, + "step": 7016 + }, + { + "epoch": 0.7705908192400615, + "grad_norm": 2.3466477394104004, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7100542187690735, + "num_tokens": 177863262.0, + "step": 7017 + }, + { + "epoch": 0.7707006369426752, + "grad_norm": 2.4004571437835693, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7124817967414856, + "num_tokens": 
177883225.0, + "step": 7018 + }, + { + "epoch": 0.7708104546452889, + "grad_norm": 2.0908877849578857, + "learning_rate": 1e-06, + "loss": 1.0035, + "mean_token_accuracy": 0.691929280757904, + "num_tokens": 177911438.0, + "step": 7019 + }, + { + "epoch": 0.7709202723479025, + "grad_norm": 2.0465805530548096, + "learning_rate": 1e-06, + "loss": 1.0416, + "mean_token_accuracy": 0.682969331741333, + "num_tokens": 177940773.0, + "step": 7020 + }, + { + "epoch": 0.7710300900505161, + "grad_norm": 2.050858736038208, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7187163829803467, + "num_tokens": 177965953.0, + "step": 7021 + }, + { + "epoch": 0.7711399077531298, + "grad_norm": 2.174767017364502, + "learning_rate": 1e-06, + "loss": 1.0364, + "mean_token_accuracy": 0.6895542144775391, + "num_tokens": 177994163.0, + "step": 7022 + }, + { + "epoch": 0.7712497254557434, + "grad_norm": 2.4325244426727295, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.7152982354164124, + "num_tokens": 178015638.0, + "step": 7023 + }, + { + "epoch": 0.7713595431583571, + "grad_norm": 2.188309907913208, + "learning_rate": 1e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7326331734657288, + "num_tokens": 178041376.0, + "step": 7024 + }, + { + "epoch": 0.7714693608609708, + "grad_norm": 2.123403310775757, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.6967483162879944, + "num_tokens": 178069778.0, + "step": 7025 + }, + { + "epoch": 0.7715791785635845, + "grad_norm": 2.212315082550049, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7015388011932373, + "num_tokens": 178094834.0, + "step": 7026 + }, + { + "epoch": 0.7716889962661981, + "grad_norm": 2.333913803100586, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7107424139976501, + "num_tokens": 178120726.0, + "step": 7027 + }, + { + "epoch": 0.7717988139688118, + "grad_norm": 2.4206082820892334, + "learning_rate": 1e-06, + 
"loss": 0.9702, + "mean_token_accuracy": 0.7019909620285034, + "num_tokens": 178142068.0, + "step": 7028 + }, + { + "epoch": 0.7719086316714254, + "grad_norm": 2.3479204177856445, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7228370308876038, + "num_tokens": 178166680.0, + "step": 7029 + }, + { + "epoch": 0.7720184493740391, + "grad_norm": 2.5351173877716064, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.7117657661437988, + "num_tokens": 178187163.0, + "step": 7030 + }, + { + "epoch": 0.7721282670766527, + "grad_norm": 2.221925735473633, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7270339727401733, + "num_tokens": 178211325.0, + "step": 7031 + }, + { + "epoch": 0.7722380847792664, + "grad_norm": 2.3411405086517334, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.6900087594985962, + "num_tokens": 178235591.0, + "step": 7032 + }, + { + "epoch": 0.7723479024818801, + "grad_norm": 2.4297597408294678, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7068359851837158, + "num_tokens": 178257971.0, + "step": 7033 + }, + { + "epoch": 0.7724577201844938, + "grad_norm": 2.165585517883301, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.6905478835105896, + "num_tokens": 178284407.0, + "step": 7034 + }, + { + "epoch": 0.7725675378871074, + "grad_norm": 2.4124083518981934, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7281080484390259, + "num_tokens": 178305359.0, + "step": 7035 + }, + { + "epoch": 0.7726773555897211, + "grad_norm": 2.0320589542388916, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.7236502170562744, + "num_tokens": 178332834.0, + "step": 7036 + }, + { + "epoch": 0.7727871732923347, + "grad_norm": 2.24170184135437, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.6956561207771301, + "num_tokens": 178359775.0, + "step": 7037 + }, + { + "epoch": 
0.7728969909949484, + "grad_norm": 2.546957015991211, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7129784822463989, + "num_tokens": 178380983.0, + "step": 7038 + }, + { + "epoch": 0.773006808697562, + "grad_norm": 2.2904562950134277, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7053890824317932, + "num_tokens": 178404510.0, + "step": 7039 + }, + { + "epoch": 0.7731166264001758, + "grad_norm": 2.394622802734375, + "learning_rate": 1e-06, + "loss": 1.01, + "mean_token_accuracy": 0.6962532997131348, + "num_tokens": 178428050.0, + "step": 7040 + }, + { + "epoch": 0.7732264441027894, + "grad_norm": 2.2586357593536377, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7001718282699585, + "num_tokens": 178453156.0, + "step": 7041 + }, + { + "epoch": 0.773336261805403, + "grad_norm": 2.0332469940185547, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.714749813079834, + "num_tokens": 178481487.0, + "step": 7042 + }, + { + "epoch": 0.7734460795080167, + "grad_norm": 2.2309491634368896, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7204053401947021, + "num_tokens": 178504508.0, + "step": 7043 + }, + { + "epoch": 0.7735558972106303, + "grad_norm": 2.214951992034912, + "learning_rate": 1e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7190119028091431, + "num_tokens": 178530962.0, + "step": 7044 + }, + { + "epoch": 0.773665714913244, + "grad_norm": 2.060774564743042, + "learning_rate": 1e-06, + "loss": 1.0316, + "mean_token_accuracy": 0.6872890591621399, + "num_tokens": 178561482.0, + "step": 7045 + }, + { + "epoch": 0.7737755326158576, + "grad_norm": 2.0309746265411377, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7028456330299377, + "num_tokens": 178588081.0, + "step": 7046 + }, + { + "epoch": 0.7738853503184714, + "grad_norm": 2.0752127170562744, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 
0.713900089263916, + "num_tokens": 178615646.0, + "step": 7047 + }, + { + "epoch": 0.773995168021085, + "grad_norm": 2.310689926147461, + "learning_rate": 1e-06, + "loss": 1.0333, + "mean_token_accuracy": 0.6857904195785522, + "num_tokens": 178641367.0, + "step": 7048 + }, + { + "epoch": 0.7741049857236987, + "grad_norm": 2.30474853515625, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7163321375846863, + "num_tokens": 178664816.0, + "step": 7049 + }, + { + "epoch": 0.7742148034263123, + "grad_norm": 2.3800101280212402, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7374341487884521, + "num_tokens": 178685847.0, + "step": 7050 + }, + { + "epoch": 0.774324621128926, + "grad_norm": 2.2209813594818115, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7120206356048584, + "num_tokens": 178710642.0, + "step": 7051 + }, + { + "epoch": 0.7744344388315396, + "grad_norm": 2.226344108581543, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7185060381889343, + "num_tokens": 178735677.0, + "step": 7052 + }, + { + "epoch": 0.7745442565341533, + "grad_norm": 2.6636383533477783, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7144626975059509, + "num_tokens": 178754296.0, + "step": 7053 + }, + { + "epoch": 0.774654074236767, + "grad_norm": 2.2603862285614014, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7124506831169128, + "num_tokens": 178779337.0, + "step": 7054 + }, + { + "epoch": 0.7747638919393807, + "grad_norm": 2.265195608139038, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7238624095916748, + "num_tokens": 178803171.0, + "step": 7055 + }, + { + "epoch": 0.7748737096419943, + "grad_norm": 2.2102255821228027, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7097135186195374, + "num_tokens": 178828712.0, + "step": 7056 + }, + { + "epoch": 0.774983527344608, + "grad_norm": 2.3727927207946777, 
+ "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7133183479309082, + "num_tokens": 178850564.0, + "step": 7057 + }, + { + "epoch": 0.7750933450472216, + "grad_norm": 2.3800151348114014, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7012033462524414, + "num_tokens": 178873801.0, + "step": 7058 + }, + { + "epoch": 0.7752031627498353, + "grad_norm": 2.0106301307678223, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.6948956847190857, + "num_tokens": 178904338.0, + "step": 7059 + }, + { + "epoch": 0.7753129804524489, + "grad_norm": 2.552781343460083, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7128279209136963, + "num_tokens": 178924803.0, + "step": 7060 + }, + { + "epoch": 0.7754227981550625, + "grad_norm": 2.7640583515167236, + "learning_rate": 1e-06, + "loss": 0.8606, + "mean_token_accuracy": 0.7340503931045532, + "num_tokens": 178943153.0, + "step": 7061 + }, + { + "epoch": 0.7755326158576763, + "grad_norm": 2.31835675239563, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7195791602134705, + "num_tokens": 178965335.0, + "step": 7062 + }, + { + "epoch": 0.77564243356029, + "grad_norm": 2.4967200756073, + "learning_rate": 1e-06, + "loss": 0.852, + "mean_token_accuracy": 0.7320220470428467, + "num_tokens": 178983827.0, + "step": 7063 + }, + { + "epoch": 0.7757522512629036, + "grad_norm": 2.0164687633514404, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.70870041847229, + "num_tokens": 179011820.0, + "step": 7064 + }, + { + "epoch": 0.7758620689655172, + "grad_norm": 2.234649419784546, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.705342710018158, + "num_tokens": 179035925.0, + "step": 7065 + }, + { + "epoch": 0.7759718866681309, + "grad_norm": 2.148369073867798, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.699741005897522, + "num_tokens": 179060212.0, + "step": 7066 + }, + { + 
"epoch": 0.7760817043707445, + "grad_norm": 2.3442158699035645, + "learning_rate": 1e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7327489852905273, + "num_tokens": 179082222.0, + "step": 7067 + }, + { + "epoch": 0.7761915220733582, + "grad_norm": 1.9626551866531372, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.700068473815918, + "num_tokens": 179112038.0, + "step": 7068 + }, + { + "epoch": 0.7763013397759719, + "grad_norm": 2.2076330184936523, + "learning_rate": 1e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7079133987426758, + "num_tokens": 179136604.0, + "step": 7069 + }, + { + "epoch": 0.7764111574785856, + "grad_norm": 2.4623193740844727, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7228535413742065, + "num_tokens": 179155981.0, + "step": 7070 + }, + { + "epoch": 0.7765209751811992, + "grad_norm": 2.170898675918579, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7113635540008545, + "num_tokens": 179180803.0, + "step": 7071 + }, + { + "epoch": 0.7766307928838129, + "grad_norm": 2.5709426403045654, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7074155807495117, + "num_tokens": 179201416.0, + "step": 7072 + }, + { + "epoch": 0.7767406105864265, + "grad_norm": 2.3767716884613037, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.6892225742340088, + "num_tokens": 179224783.0, + "step": 7073 + }, + { + "epoch": 0.7768504282890402, + "grad_norm": 2.184709310531616, + "learning_rate": 1e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.7220743894577026, + "num_tokens": 179248662.0, + "step": 7074 + }, + { + "epoch": 0.7769602459916538, + "grad_norm": 2.413346290588379, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7213810086250305, + "num_tokens": 179269131.0, + "step": 7075 + }, + { + "epoch": 0.7770700636942676, + "grad_norm": 2.01743483543396, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 
0.7234665155410767, + "num_tokens": 179297098.0, + "step": 7076 + }, + { + "epoch": 0.7771798813968812, + "grad_norm": 2.1281111240386963, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.698932409286499, + "num_tokens": 179322292.0, + "step": 7077 + }, + { + "epoch": 0.7772896990994949, + "grad_norm": 1.9461393356323242, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.6889413595199585, + "num_tokens": 179354844.0, + "step": 7078 + }, + { + "epoch": 0.7773995168021085, + "grad_norm": 2.251168727874756, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7229845523834229, + "num_tokens": 179376922.0, + "step": 7079 + }, + { + "epoch": 0.7775093345047221, + "grad_norm": 2.056157350540161, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.6901798248291016, + "num_tokens": 179405570.0, + "step": 7080 + }, + { + "epoch": 0.7776191522073358, + "grad_norm": 2.040438413619995, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7104302644729614, + "num_tokens": 179432939.0, + "step": 7081 + }, + { + "epoch": 0.7777289699099494, + "grad_norm": 2.451167345046997, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7025101184844971, + "num_tokens": 179454034.0, + "step": 7082 + }, + { + "epoch": 0.7778387876125632, + "grad_norm": 1.8757659196853638, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7088644504547119, + "num_tokens": 179485448.0, + "step": 7083 + }, + { + "epoch": 0.7779486053151768, + "grad_norm": 2.1316449642181396, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.6926569938659668, + "num_tokens": 179512698.0, + "step": 7084 + }, + { + "epoch": 0.7780584230177905, + "grad_norm": 2.7438642978668213, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7282918691635132, + "num_tokens": 179529123.0, + "step": 7085 + }, + { + "epoch": 0.7781682407204041, + "grad_norm": 
2.262704610824585, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7118374109268188, + "num_tokens": 179553256.0, + "step": 7086 + }, + { + "epoch": 0.7782780584230178, + "grad_norm": 2.3139114379882812, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7024273872375488, + "num_tokens": 179577483.0, + "step": 7087 + }, + { + "epoch": 0.7783878761256314, + "grad_norm": 2.0233042240142822, + "learning_rate": 1e-06, + "loss": 1.095, + "mean_token_accuracy": 0.6798396110534668, + "num_tokens": 179608451.0, + "step": 7088 + }, + { + "epoch": 0.7784976938282451, + "grad_norm": 2.4469082355499268, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7110005617141724, + "num_tokens": 179629542.0, + "step": 7089 + }, + { + "epoch": 0.7786075115308587, + "grad_norm": 2.505898952484131, + "learning_rate": 1e-06, + "loss": 0.7991, + "mean_token_accuracy": 0.7437442541122437, + "num_tokens": 179646790.0, + "step": 7090 + }, + { + "epoch": 0.7787173292334725, + "grad_norm": 2.4614827632904053, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.7007860541343689, + "num_tokens": 179669282.0, + "step": 7091 + }, + { + "epoch": 0.7788271469360861, + "grad_norm": 2.1343486309051514, + "learning_rate": 1e-06, + "loss": 1.025, + "mean_token_accuracy": 0.6838500499725342, + "num_tokens": 179697089.0, + "step": 7092 + }, + { + "epoch": 0.7789369646386998, + "grad_norm": 1.9251229763031006, + "learning_rate": 1e-06, + "loss": 1.043, + "mean_token_accuracy": 0.6844090223312378, + "num_tokens": 179731507.0, + "step": 7093 + }, + { + "epoch": 0.7790467823413134, + "grad_norm": 2.4280872344970703, + "learning_rate": 1e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.68711918592453, + "num_tokens": 179752569.0, + "step": 7094 + }, + { + "epoch": 0.7791566000439271, + "grad_norm": 2.40913724899292, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7182817459106445, + "num_tokens": 179774299.0, 
+ "step": 7095 + }, + { + "epoch": 0.7792664177465407, + "grad_norm": 2.3949315547943115, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.6927556395530701, + "num_tokens": 179798589.0, + "step": 7096 + }, + { + "epoch": 0.7793762354491544, + "grad_norm": 2.2869415283203125, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7071710824966431, + "num_tokens": 179820498.0, + "step": 7097 + }, + { + "epoch": 0.7794860531517681, + "grad_norm": 2.2848892211914062, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7229636907577515, + "num_tokens": 179843335.0, + "step": 7098 + }, + { + "epoch": 0.7795958708543818, + "grad_norm": 2.037635087966919, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7059617638587952, + "num_tokens": 179872778.0, + "step": 7099 + }, + { + "epoch": 0.7797056885569954, + "grad_norm": 2.129899024963379, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.711457371711731, + "num_tokens": 179899661.0, + "step": 7100 + }, + { + "epoch": 0.779815506259609, + "grad_norm": 2.1907918453216553, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.6929113268852234, + "num_tokens": 179926595.0, + "step": 7101 + }, + { + "epoch": 0.7799253239622227, + "grad_norm": 2.0739729404449463, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7132043838500977, + "num_tokens": 179954559.0, + "step": 7102 + }, + { + "epoch": 0.7800351416648363, + "grad_norm": 2.307727575302124, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.712765634059906, + "num_tokens": 179977423.0, + "step": 7103 + }, + { + "epoch": 0.78014495936745, + "grad_norm": 2.136479139328003, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7019742727279663, + "num_tokens": 180003567.0, + "step": 7104 + }, + { + "epoch": 0.7802547770700637, + "grad_norm": 2.051190137863159, + "learning_rate": 1e-06, + "loss": 1.0786, + 
"mean_token_accuracy": 0.6757291555404663, + "num_tokens": 180033881.0, + "step": 7105 + }, + { + "epoch": 0.7803645947726774, + "grad_norm": 1.8855282068252563, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7064105272293091, + "num_tokens": 180065263.0, + "step": 7106 + }, + { + "epoch": 0.780474412475291, + "grad_norm": 2.245919942855835, + "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.6980609893798828, + "num_tokens": 180088870.0, + "step": 7107 + }, + { + "epoch": 0.7805842301779047, + "grad_norm": 1.9827312231063843, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7131274938583374, + "num_tokens": 180114237.0, + "step": 7108 + }, + { + "epoch": 0.7806940478805183, + "grad_norm": 2.0468804836273193, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7147074937820435, + "num_tokens": 180140652.0, + "step": 7109 + }, + { + "epoch": 0.780803865583132, + "grad_norm": 2.281320810317993, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7134425044059753, + "num_tokens": 180163398.0, + "step": 7110 + }, + { + "epoch": 0.7809136832857456, + "grad_norm": 2.2027781009674072, + "learning_rate": 1e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.6918683648109436, + "num_tokens": 180189004.0, + "step": 7111 + }, + { + "epoch": 0.7810235009883594, + "grad_norm": 1.9523528814315796, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7010420560836792, + "num_tokens": 180218909.0, + "step": 7112 + }, + { + "epoch": 0.781133318690973, + "grad_norm": 2.209707498550415, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7199904322624207, + "num_tokens": 180243977.0, + "step": 7113 + }, + { + "epoch": 0.7812431363935867, + "grad_norm": 2.255502223968506, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7192504405975342, + "num_tokens": 180267010.0, + "step": 7114 + }, + { + "epoch": 0.7813529540962003, + 
"grad_norm": 2.2841298580169678, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7096176147460938, + "num_tokens": 180290378.0, + "step": 7115 + }, + { + "epoch": 0.781462771798814, + "grad_norm": 2.167401075363159, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7089289426803589, + "num_tokens": 180315036.0, + "step": 7116 + }, + { + "epoch": 0.7815725895014276, + "grad_norm": 2.2144572734832764, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.6967758536338806, + "num_tokens": 180341505.0, + "step": 7117 + }, + { + "epoch": 0.7816824072040413, + "grad_norm": 2.4178757667541504, + "learning_rate": 1e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7184953689575195, + "num_tokens": 180362746.0, + "step": 7118 + }, + { + "epoch": 0.7817922249066549, + "grad_norm": 2.112807035446167, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.704974889755249, + "num_tokens": 180391293.0, + "step": 7119 + }, + { + "epoch": 0.7819020426092687, + "grad_norm": 2.149611473083496, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7080668210983276, + "num_tokens": 180417977.0, + "step": 7120 + }, + { + "epoch": 0.7820118603118823, + "grad_norm": 2.2531654834747314, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7000689506530762, + "num_tokens": 180443182.0, + "step": 7121 + }, + { + "epoch": 0.782121678014496, + "grad_norm": 2.348355293273926, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.6978178024291992, + "num_tokens": 180466738.0, + "step": 7122 + }, + { + "epoch": 0.7822314957171096, + "grad_norm": 2.064785957336426, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.7009601593017578, + "num_tokens": 180497239.0, + "step": 7123 + }, + { + "epoch": 0.7823413134197232, + "grad_norm": 2.063035488128662, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.7025096416473389, + "num_tokens": 
180525458.0, + "step": 7124 + }, + { + "epoch": 0.7824511311223369, + "grad_norm": 2.036741256713867, + "learning_rate": 1e-06, + "loss": 1.0372, + "mean_token_accuracy": 0.6866459846496582, + "num_tokens": 180555479.0, + "step": 7125 + }, + { + "epoch": 0.7825609488249505, + "grad_norm": 2.054818630218506, + "learning_rate": 1e-06, + "loss": 1.0827, + "mean_token_accuracy": 0.6740458011627197, + "num_tokens": 180582872.0, + "step": 7126 + }, + { + "epoch": 0.7826707665275643, + "grad_norm": 2.3437769412994385, + "learning_rate": 1e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7324252724647522, + "num_tokens": 180603692.0, + "step": 7127 + }, + { + "epoch": 0.7827805842301779, + "grad_norm": 2.127891778945923, + "learning_rate": 1e-06, + "loss": 1.0827, + "mean_token_accuracy": 0.6737020611763, + "num_tokens": 180630878.0, + "step": 7128 + }, + { + "epoch": 0.7828904019327916, + "grad_norm": 2.192418336868286, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6957187652587891, + "num_tokens": 180656071.0, + "step": 7129 + }, + { + "epoch": 0.7830002196354052, + "grad_norm": 2.253216028213501, + "learning_rate": 1e-06, + "loss": 0.83, + "mean_token_accuracy": 0.7324292659759521, + "num_tokens": 180677290.0, + "step": 7130 + }, + { + "epoch": 0.7831100373380189, + "grad_norm": 2.1445066928863525, + "learning_rate": 1e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.6938051581382751, + "num_tokens": 180704633.0, + "step": 7131 + }, + { + "epoch": 0.7832198550406325, + "grad_norm": 2.2312066555023193, + "learning_rate": 1e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.7052396535873413, + "num_tokens": 180731045.0, + "step": 7132 + }, + { + "epoch": 0.7833296727432462, + "grad_norm": 2.0526769161224365, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.6985551118850708, + "num_tokens": 180759643.0, + "step": 7133 + }, + { + "epoch": 0.7834394904458599, + "grad_norm": 2.262282371520996, + "learning_rate": 1e-06, + "loss": 
0.9514, + "mean_token_accuracy": 0.7139164209365845, + "num_tokens": 180784331.0, + "step": 7134 + }, + { + "epoch": 0.7835493081484736, + "grad_norm": 2.3738324642181396, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7159295082092285, + "num_tokens": 180805422.0, + "step": 7135 + }, + { + "epoch": 0.7836591258510872, + "grad_norm": 2.3427178859710693, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.6885263919830322, + "num_tokens": 180826796.0, + "step": 7136 + }, + { + "epoch": 0.7837689435537009, + "grad_norm": 2.0387840270996094, + "learning_rate": 1e-06, + "loss": 1.0536, + "mean_token_accuracy": 0.6888878345489502, + "num_tokens": 180856953.0, + "step": 7137 + }, + { + "epoch": 0.7838787612563145, + "grad_norm": 2.5517048835754395, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7059438824653625, + "num_tokens": 180879600.0, + "step": 7138 + }, + { + "epoch": 0.7839885789589282, + "grad_norm": 2.341531991958618, + "learning_rate": 1e-06, + "loss": 0.8501, + "mean_token_accuracy": 0.7366990447044373, + "num_tokens": 180899995.0, + "step": 7139 + }, + { + "epoch": 0.7840983966615418, + "grad_norm": 2.1266088485717773, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7288448810577393, + "num_tokens": 180925520.0, + "step": 7140 + }, + { + "epoch": 0.7842082143641556, + "grad_norm": 1.946272373199463, + "learning_rate": 1e-06, + "loss": 1.0423, + "mean_token_accuracy": 0.690293550491333, + "num_tokens": 180958015.0, + "step": 7141 + }, + { + "epoch": 0.7843180320667692, + "grad_norm": 2.0726678371429443, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7104359865188599, + "num_tokens": 180984509.0, + "step": 7142 + }, + { + "epoch": 0.7844278497693828, + "grad_norm": 2.1312122344970703, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.6953835487365723, + "num_tokens": 181012157.0, + "step": 7143 + }, + { + "epoch": 
0.7845376674719965, + "grad_norm": 2.1948652267456055, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7112782001495361, + "num_tokens": 181036615.0, + "step": 7144 + }, + { + "epoch": 0.7846474851746101, + "grad_norm": 2.0549256801605225, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7054787278175354, + "num_tokens": 181063943.0, + "step": 7145 + }, + { + "epoch": 0.7847573028772238, + "grad_norm": 2.356186628341675, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.7031598091125488, + "num_tokens": 181086639.0, + "step": 7146 + }, + { + "epoch": 0.7848671205798374, + "grad_norm": 2.396578073501587, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7114342451095581, + "num_tokens": 181111315.0, + "step": 7147 + }, + { + "epoch": 0.7849769382824512, + "grad_norm": 2.2909600734710693, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.7099113464355469, + "num_tokens": 181135377.0, + "step": 7148 + }, + { + "epoch": 0.7850867559850648, + "grad_norm": 1.9408679008483887, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.7085403203964233, + "num_tokens": 181165099.0, + "step": 7149 + }, + { + "epoch": 0.7851965736876785, + "grad_norm": 2.12882137298584, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.7015142440795898, + "num_tokens": 181192635.0, + "step": 7150 + }, + { + "epoch": 0.7853063913902921, + "grad_norm": 2.178816318511963, + "learning_rate": 1e-06, + "loss": 1.023, + "mean_token_accuracy": 0.686036229133606, + "num_tokens": 181218294.0, + "step": 7151 + }, + { + "epoch": 0.7854162090929058, + "grad_norm": 2.343216896057129, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7197444438934326, + "num_tokens": 181240408.0, + "step": 7152 + }, + { + "epoch": 0.7855260267955194, + "grad_norm": 2.022611141204834, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 
0.7049555778503418, + "num_tokens": 181268713.0, + "step": 7153 + }, + { + "epoch": 0.7856358444981331, + "grad_norm": 2.1501762866973877, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.7043758630752563, + "num_tokens": 181296555.0, + "step": 7154 + }, + { + "epoch": 0.7857456622007467, + "grad_norm": 2.5101728439331055, + "learning_rate": 1e-06, + "loss": 1.0378, + "mean_token_accuracy": 0.6892701983451843, + "num_tokens": 181317565.0, + "step": 7155 + }, + { + "epoch": 0.7858554799033605, + "grad_norm": 2.2389872074127197, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7146925926208496, + "num_tokens": 181342458.0, + "step": 7156 + }, + { + "epoch": 0.7859652976059741, + "grad_norm": 2.182422399520874, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.7025827765464783, + "num_tokens": 181368754.0, + "step": 7157 + }, + { + "epoch": 0.7860751153085878, + "grad_norm": 2.1792407035827637, + "learning_rate": 1e-06, + "loss": 0.7953, + "mean_token_accuracy": 0.7530575394630432, + "num_tokens": 181390210.0, + "step": 7158 + }, + { + "epoch": 0.7861849330112014, + "grad_norm": 2.604257106781006, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7026662826538086, + "num_tokens": 181410146.0, + "step": 7159 + }, + { + "epoch": 0.786294750713815, + "grad_norm": 2.351266384124756, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7240354418754578, + "num_tokens": 181433116.0, + "step": 7160 + }, + { + "epoch": 0.7864045684164287, + "grad_norm": 2.134964942932129, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.6956450939178467, + "num_tokens": 181458630.0, + "step": 7161 + }, + { + "epoch": 0.7865143861190423, + "grad_norm": 2.132174491882324, + "learning_rate": 1e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.735240638256073, + "num_tokens": 181482596.0, + "step": 7162 + }, + { + "epoch": 0.7866242038216561, + "grad_norm": 
2.3251893520355225, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6831192374229431, + "num_tokens": 181508021.0, + "step": 7163 + }, + { + "epoch": 0.7867340215242697, + "grad_norm": 2.0621914863586426, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.6981143355369568, + "num_tokens": 181537915.0, + "step": 7164 + }, + { + "epoch": 0.7868438392268834, + "grad_norm": 2.0631253719329834, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7096470594406128, + "num_tokens": 181565591.0, + "step": 7165 + }, + { + "epoch": 0.786953656929497, + "grad_norm": 2.0857441425323486, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7248178720474243, + "num_tokens": 181590528.0, + "step": 7166 + }, + { + "epoch": 0.7870634746321107, + "grad_norm": 2.4418704509735107, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7066500782966614, + "num_tokens": 181613826.0, + "step": 7167 + }, + { + "epoch": 0.7871732923347243, + "grad_norm": 2.403047800064087, + "learning_rate": 1e-06, + "loss": 0.8272, + "mean_token_accuracy": 0.7391901016235352, + "num_tokens": 181634452.0, + "step": 7168 + }, + { + "epoch": 0.787283110037338, + "grad_norm": 1.9134100675582886, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.7053236961364746, + "num_tokens": 181665460.0, + "step": 7169 + }, + { + "epoch": 0.7873929277399517, + "grad_norm": 2.111344337463379, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7079723477363586, + "num_tokens": 181692382.0, + "step": 7170 + }, + { + "epoch": 0.7875027454425654, + "grad_norm": 2.2650997638702393, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7130839824676514, + "num_tokens": 181715754.0, + "step": 7171 + }, + { + "epoch": 0.787612563145179, + "grad_norm": 2.552922487258911, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7237032651901245, + "num_tokens": 
181734357.0, + "step": 7172 + }, + { + "epoch": 0.7877223808477927, + "grad_norm": 2.0420100688934326, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.7025678157806396, + "num_tokens": 181762246.0, + "step": 7173 + }, + { + "epoch": 0.7878321985504063, + "grad_norm": 2.207376718521118, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.700725793838501, + "num_tokens": 181787691.0, + "step": 7174 + }, + { + "epoch": 0.78794201625302, + "grad_norm": 2.550811290740967, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.6987099647521973, + "num_tokens": 181809033.0, + "step": 7175 + }, + { + "epoch": 0.7880518339556336, + "grad_norm": 2.3948609828948975, + "learning_rate": 1e-06, + "loss": 1.0868, + "mean_token_accuracy": 0.669638991355896, + "num_tokens": 181833122.0, + "step": 7176 + }, + { + "epoch": 0.7881616516582474, + "grad_norm": 2.4256508350372314, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7094806432723999, + "num_tokens": 181854432.0, + "step": 7177 + }, + { + "epoch": 0.788271469360861, + "grad_norm": 2.158592939376831, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.711669921875, + "num_tokens": 181880136.0, + "step": 7178 + }, + { + "epoch": 0.7883812870634747, + "grad_norm": 2.269385576248169, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.6973320245742798, + "num_tokens": 181907304.0, + "step": 7179 + }, + { + "epoch": 0.7884911047660883, + "grad_norm": 2.3636505603790283, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7092591524124146, + "num_tokens": 181930186.0, + "step": 7180 + }, + { + "epoch": 0.788600922468702, + "grad_norm": 2.0386321544647217, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7293519973754883, + "num_tokens": 181954832.0, + "step": 7181 + }, + { + "epoch": 0.7887107401713156, + "grad_norm": 2.193171977996826, + "learning_rate": 1e-06, + "loss": 
0.9102, + "mean_token_accuracy": 0.7127902507781982, + "num_tokens": 181979181.0, + "step": 7182 + }, + { + "epoch": 0.7888205578739292, + "grad_norm": 2.326378345489502, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7035831212997437, + "num_tokens": 182001765.0, + "step": 7183 + }, + { + "epoch": 0.7889303755765429, + "grad_norm": 2.492583990097046, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7096125483512878, + "num_tokens": 182023553.0, + "step": 7184 + }, + { + "epoch": 0.7890401932791566, + "grad_norm": 2.3333258628845215, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.6961644291877747, + "num_tokens": 182048052.0, + "step": 7185 + }, + { + "epoch": 0.7891500109817703, + "grad_norm": 1.8830435276031494, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7016072869300842, + "num_tokens": 182079301.0, + "step": 7186 + }, + { + "epoch": 0.7892598286843839, + "grad_norm": 2.096214771270752, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7120164632797241, + "num_tokens": 182105903.0, + "step": 7187 + }, + { + "epoch": 0.7893696463869976, + "grad_norm": 2.4542248249053955, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7135970592498779, + "num_tokens": 182126492.0, + "step": 7188 + }, + { + "epoch": 0.7894794640896112, + "grad_norm": 2.3471927642822266, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.71811842918396, + "num_tokens": 182149707.0, + "step": 7189 + }, + { + "epoch": 0.7895892817922249, + "grad_norm": 2.232553720474243, + "learning_rate": 1e-06, + "loss": 1.0445, + "mean_token_accuracy": 0.6927762031555176, + "num_tokens": 182174929.0, + "step": 7190 + }, + { + "epoch": 0.7896990994948385, + "grad_norm": 2.366917371749878, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7066053152084351, + "num_tokens": 182197368.0, + "step": 7191 + }, + { + "epoch": 0.7898089171974523, 
+ "grad_norm": 1.9969887733459473, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7094494104385376, + "num_tokens": 182228141.0, + "step": 7192 + }, + { + "epoch": 0.7899187349000659, + "grad_norm": 1.9710832834243774, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.704967200756073, + "num_tokens": 182256851.0, + "step": 7193 + }, + { + "epoch": 0.7900285526026796, + "grad_norm": 2.2451720237731934, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.728705644607544, + "num_tokens": 182279496.0, + "step": 7194 + }, + { + "epoch": 0.7901383703052932, + "grad_norm": 1.9565834999084473, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.7016122341156006, + "num_tokens": 182311113.0, + "step": 7195 + }, + { + "epoch": 0.7902481880079069, + "grad_norm": 2.5661022663116455, + "learning_rate": 1e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.7327525019645691, + "num_tokens": 182331666.0, + "step": 7196 + }, + { + "epoch": 0.7903580057105205, + "grad_norm": 1.961960792541504, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7169776558876038, + "num_tokens": 182360581.0, + "step": 7197 + }, + { + "epoch": 0.7904678234131342, + "grad_norm": 2.287184000015259, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.717340350151062, + "num_tokens": 182383005.0, + "step": 7198 + }, + { + "epoch": 0.7905776411157479, + "grad_norm": 2.242499589920044, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7145897150039673, + "num_tokens": 182407467.0, + "step": 7199 + }, + { + "epoch": 0.7906874588183616, + "grad_norm": 2.2812023162841797, + "learning_rate": 1e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7243844866752625, + "num_tokens": 182430427.0, + "step": 7200 + }, + { + "epoch": 0.7907972765209752, + "grad_norm": 2.312058925628662, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7285785675048828, + 
"num_tokens": 182453463.0, + "step": 7201 + }, + { + "epoch": 0.7909070942235888, + "grad_norm": 2.0680336952209473, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7135024666786194, + "num_tokens": 182479022.0, + "step": 7202 + }, + { + "epoch": 0.7910169119262025, + "grad_norm": 2.097317934036255, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7229036688804626, + "num_tokens": 182503261.0, + "step": 7203 + }, + { + "epoch": 0.7911267296288161, + "grad_norm": 2.3008668422698975, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7105995416641235, + "num_tokens": 182527017.0, + "step": 7204 + }, + { + "epoch": 0.7912365473314298, + "grad_norm": 2.3047218322753906, + "learning_rate": 1e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.6908208131790161, + "num_tokens": 182550601.0, + "step": 7205 + }, + { + "epoch": 0.7913463650340435, + "grad_norm": 2.0485270023345947, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7229782342910767, + "num_tokens": 182575763.0, + "step": 7206 + }, + { + "epoch": 0.7914561827366572, + "grad_norm": 2.190443515777588, + "learning_rate": 1e-06, + "loss": 1.0383, + "mean_token_accuracy": 0.6870999336242676, + "num_tokens": 182602041.0, + "step": 7207 + }, + { + "epoch": 0.7915660004392708, + "grad_norm": 2.2640061378479004, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7129548788070679, + "num_tokens": 182627053.0, + "step": 7208 + }, + { + "epoch": 0.7916758181418845, + "grad_norm": 2.2805325984954834, + "learning_rate": 1e-06, + "loss": 1.016, + "mean_token_accuracy": 0.6916441321372986, + "num_tokens": 182652465.0, + "step": 7209 + }, + { + "epoch": 0.7917856358444981, + "grad_norm": 2.322970390319824, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7253055572509766, + "num_tokens": 182673782.0, + "step": 7210 + }, + { + "epoch": 0.7918954535471118, + "grad_norm": 2.2468769550323486, + 
"learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7223303914070129, + "num_tokens": 182698545.0, + "step": 7211 + }, + { + "epoch": 0.7920052712497254, + "grad_norm": 2.2433130741119385, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7184919714927673, + "num_tokens": 182721866.0, + "step": 7212 + }, + { + "epoch": 0.7921150889523391, + "grad_norm": 2.012657642364502, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7191934585571289, + "num_tokens": 182749004.0, + "step": 7213 + }, + { + "epoch": 0.7922249066549528, + "grad_norm": 2.206432342529297, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7166255116462708, + "num_tokens": 182773042.0, + "step": 7214 + }, + { + "epoch": 0.7923347243575665, + "grad_norm": 2.4087421894073486, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7139971852302551, + "num_tokens": 182793796.0, + "step": 7215 + }, + { + "epoch": 0.7924445420601801, + "grad_norm": 2.3680419921875, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.689624547958374, + "num_tokens": 182815481.0, + "step": 7216 + }, + { + "epoch": 0.7925543597627938, + "grad_norm": 1.9539531469345093, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7174748182296753, + "num_tokens": 182843916.0, + "step": 7217 + }, + { + "epoch": 0.7926641774654074, + "grad_norm": 2.214151382446289, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.696861982345581, + "num_tokens": 182868149.0, + "step": 7218 + }, + { + "epoch": 0.792773995168021, + "grad_norm": 1.8906834125518799, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.6930564641952515, + "num_tokens": 182902962.0, + "step": 7219 + }, + { + "epoch": 0.7928838128706347, + "grad_norm": 2.061971426010132, + "learning_rate": 1e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.6939147710800171, + "num_tokens": 182930822.0, + "step": 7220 + }, + 
{ + "epoch": 0.7929936305732485, + "grad_norm": 1.9795658588409424, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7109184265136719, + "num_tokens": 182960419.0, + "step": 7221 + }, + { + "epoch": 0.7931034482758621, + "grad_norm": 2.0451807975769043, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.7030737400054932, + "num_tokens": 182991082.0, + "step": 7222 + }, + { + "epoch": 0.7932132659784757, + "grad_norm": 2.5958621501922607, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7164766788482666, + "num_tokens": 183011477.0, + "step": 7223 + }, + { + "epoch": 0.7933230836810894, + "grad_norm": 2.108996868133545, + "learning_rate": 1e-06, + "loss": 1.0276, + "mean_token_accuracy": 0.6848621368408203, + "num_tokens": 183041639.0, + "step": 7224 + }, + { + "epoch": 0.793432901383703, + "grad_norm": 2.3020715713500977, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7161519527435303, + "num_tokens": 183063914.0, + "step": 7225 + }, + { + "epoch": 0.7935427190863167, + "grad_norm": 2.531334400177002, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7140974998474121, + "num_tokens": 183083582.0, + "step": 7226 + }, + { + "epoch": 0.7936525367889303, + "grad_norm": 2.2394323348999023, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7020958662033081, + "num_tokens": 183108776.0, + "step": 7227 + }, + { + "epoch": 0.7937623544915441, + "grad_norm": 2.276560068130493, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7048175930976868, + "num_tokens": 183134617.0, + "step": 7228 + }, + { + "epoch": 0.7938721721941577, + "grad_norm": 2.207041025161743, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7109410166740417, + "num_tokens": 183158706.0, + "step": 7229 + }, + { + "epoch": 0.7939819898967714, + "grad_norm": 2.1818041801452637, + "learning_rate": 1e-06, + "loss": 1.0045, + 
"mean_token_accuracy": 0.6910948753356934, + "num_tokens": 183184571.0, + "step": 7230 + }, + { + "epoch": 0.794091807599385, + "grad_norm": 2.5441174507141113, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7292906641960144, + "num_tokens": 183204223.0, + "step": 7231 + }, + { + "epoch": 0.7942016253019987, + "grad_norm": 2.03190541267395, + "learning_rate": 1e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.7064757943153381, + "num_tokens": 183233441.0, + "step": 7232 + }, + { + "epoch": 0.7943114430046123, + "grad_norm": 2.2928781509399414, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7289746999740601, + "num_tokens": 183256610.0, + "step": 7233 + }, + { + "epoch": 0.794421260707226, + "grad_norm": 2.093282699584961, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.6994508504867554, + "num_tokens": 183282362.0, + "step": 7234 + }, + { + "epoch": 0.7945310784098397, + "grad_norm": 2.1275079250335693, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.6986410021781921, + "num_tokens": 183308918.0, + "step": 7235 + }, + { + "epoch": 0.7946408961124534, + "grad_norm": 2.1851401329040527, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7220218181610107, + "num_tokens": 183337262.0, + "step": 7236 + }, + { + "epoch": 0.794750713815067, + "grad_norm": 2.4611575603485107, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7168322205543518, + "num_tokens": 183360471.0, + "step": 7237 + }, + { + "epoch": 0.7948605315176807, + "grad_norm": 1.9420669078826904, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7084004878997803, + "num_tokens": 183391436.0, + "step": 7238 + }, + { + "epoch": 0.7949703492202943, + "grad_norm": 2.017303943634033, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7174919843673706, + "num_tokens": 183421093.0, + "step": 7239 + }, + { + "epoch": 0.795080166922908, + 
"grad_norm": 2.228647470474243, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7091171145439148, + "num_tokens": 183448575.0, + "step": 7240 + }, + { + "epoch": 0.7951899846255216, + "grad_norm": 2.598243474960327, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7359750270843506, + "num_tokens": 183466150.0, + "step": 7241 + }, + { + "epoch": 0.7952998023281352, + "grad_norm": 2.304722785949707, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7234429121017456, + "num_tokens": 183488016.0, + "step": 7242 + }, + { + "epoch": 0.795409620030749, + "grad_norm": 2.3249270915985107, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7049975395202637, + "num_tokens": 183511103.0, + "step": 7243 + }, + { + "epoch": 0.7955194377333626, + "grad_norm": 2.391700267791748, + "learning_rate": 1e-06, + "loss": 1.0579, + "mean_token_accuracy": 0.6748298406600952, + "num_tokens": 183533940.0, + "step": 7244 + }, + { + "epoch": 0.7956292554359763, + "grad_norm": 2.2451822757720947, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7151666879653931, + "num_tokens": 183559538.0, + "step": 7245 + }, + { + "epoch": 0.7957390731385899, + "grad_norm": 2.1580758094787598, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.711098313331604, + "num_tokens": 183584356.0, + "step": 7246 + }, + { + "epoch": 0.7958488908412036, + "grad_norm": 2.0276424884796143, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6991006135940552, + "num_tokens": 183613109.0, + "step": 7247 + }, + { + "epoch": 0.7959587085438172, + "grad_norm": 2.1499850749969482, + "learning_rate": 1e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7272055149078369, + "num_tokens": 183637441.0, + "step": 7248 + }, + { + "epoch": 0.7960685262464309, + "grad_norm": 2.3741915225982666, + "learning_rate": 1e-06, + "loss": 0.8324, + "mean_token_accuracy": 0.7400484681129456, + 
"num_tokens": 183656963.0, + "step": 7249 + }, + { + "epoch": 0.7961783439490446, + "grad_norm": 2.220367193222046, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7054727077484131, + "num_tokens": 183680697.0, + "step": 7250 + }, + { + "epoch": 0.7962881616516583, + "grad_norm": 2.01181960105896, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.6964321136474609, + "num_tokens": 183711113.0, + "step": 7251 + }, + { + "epoch": 0.7963979793542719, + "grad_norm": 2.1064748764038086, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.6994568109512329, + "num_tokens": 183738292.0, + "step": 7252 + }, + { + "epoch": 0.7965077970568856, + "grad_norm": 1.9626061916351318, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.701034426689148, + "num_tokens": 183768255.0, + "step": 7253 + }, + { + "epoch": 0.7966176147594992, + "grad_norm": 2.2302772998809814, + "learning_rate": 1e-06, + "loss": 1.0305, + "mean_token_accuracy": 0.6847388744354248, + "num_tokens": 183793254.0, + "step": 7254 + }, + { + "epoch": 0.7967274324621129, + "grad_norm": 2.571712017059326, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.706939697265625, + "num_tokens": 183812496.0, + "step": 7255 + }, + { + "epoch": 0.7968372501647265, + "grad_norm": 2.0704593658447266, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7035135626792908, + "num_tokens": 183840044.0, + "step": 7256 + }, + { + "epoch": 0.7969470678673403, + "grad_norm": 2.1381616592407227, + "learning_rate": 1e-06, + "loss": 0.8411, + "mean_token_accuracy": 0.740741491317749, + "num_tokens": 183863640.0, + "step": 7257 + }, + { + "epoch": 0.7970568855699539, + "grad_norm": 1.8051358461380005, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.6925328969955444, + "num_tokens": 183899228.0, + "step": 7258 + }, + { + "epoch": 0.7971667032725676, + "grad_norm": 2.0866708755493164, + "learning_rate": 
1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.6955810785293579, + "num_tokens": 183926977.0, + "step": 7259 + }, + { + "epoch": 0.7972765209751812, + "grad_norm": 2.195770025253296, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7063812017440796, + "num_tokens": 183952781.0, + "step": 7260 + }, + { + "epoch": 0.7973863386777948, + "grad_norm": 2.2159981727600098, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.729339063167572, + "num_tokens": 183978028.0, + "step": 7261 + }, + { + "epoch": 0.7974961563804085, + "grad_norm": 2.1291332244873047, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.6977017521858215, + "num_tokens": 184004151.0, + "step": 7262 + }, + { + "epoch": 0.7976059740830221, + "grad_norm": 2.2794225215911865, + "learning_rate": 1e-06, + "loss": 1.0521, + "mean_token_accuracy": 0.6843644380569458, + "num_tokens": 184028712.0, + "step": 7263 + }, + { + "epoch": 0.7977157917856359, + "grad_norm": 1.9412176609039307, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7092161178588867, + "num_tokens": 184059115.0, + "step": 7264 + }, + { + "epoch": 0.7978256094882495, + "grad_norm": 1.9653253555297852, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7149739265441895, + "num_tokens": 184088455.0, + "step": 7265 + }, + { + "epoch": 0.7979354271908632, + "grad_norm": 2.365604877471924, + "learning_rate": 1e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7312705516815186, + "num_tokens": 184109847.0, + "step": 7266 + }, + { + "epoch": 0.7980452448934768, + "grad_norm": 2.1334826946258545, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.6980537176132202, + "num_tokens": 184136281.0, + "step": 7267 + }, + { + "epoch": 0.7981550625960905, + "grad_norm": 2.24214506149292, + "learning_rate": 1e-06, + "loss": 1.1052, + "mean_token_accuracy": 0.6633004546165466, + "num_tokens": 184163228.0, + "step": 7268 + }, + { + "epoch": 
0.7982648802987041, + "grad_norm": 2.691934108734131, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7173951864242554, + "num_tokens": 184181028.0, + "step": 7269 + }, + { + "epoch": 0.7983746980013178, + "grad_norm": 2.1183528900146484, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.697043776512146, + "num_tokens": 184206949.0, + "step": 7270 + }, + { + "epoch": 0.7984845157039314, + "grad_norm": 2.5615622997283936, + "learning_rate": 1e-06, + "loss": 0.8617, + "mean_token_accuracy": 0.7246424555778503, + "num_tokens": 184225550.0, + "step": 7271 + }, + { + "epoch": 0.7985943334065452, + "grad_norm": 2.4958415031433105, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7364617586135864, + "num_tokens": 184244841.0, + "step": 7272 + }, + { + "epoch": 0.7987041511091588, + "grad_norm": 2.195587396621704, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7142631411552429, + "num_tokens": 184270866.0, + "step": 7273 + }, + { + "epoch": 0.7988139688117725, + "grad_norm": 2.3349993228912354, + "learning_rate": 1e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7281994819641113, + "num_tokens": 184292208.0, + "step": 7274 + }, + { + "epoch": 0.7989237865143861, + "grad_norm": 2.3984947204589844, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7272738218307495, + "num_tokens": 184312690.0, + "step": 7275 + }, + { + "epoch": 0.7990336042169998, + "grad_norm": 2.047980546951294, + "learning_rate": 1e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.6846555471420288, + "num_tokens": 184342159.0, + "step": 7276 + }, + { + "epoch": 0.7991434219196134, + "grad_norm": 2.044766664505005, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7125775814056396, + "num_tokens": 184368474.0, + "step": 7277 + }, + { + "epoch": 0.799253239622227, + "grad_norm": 2.246253252029419, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 
0.7158763408660889, + "num_tokens": 184391745.0, + "step": 7278 + }, + { + "epoch": 0.7993630573248408, + "grad_norm": 2.1953063011169434, + "learning_rate": 1e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.6936637163162231, + "num_tokens": 184418360.0, + "step": 7279 + }, + { + "epoch": 0.7994728750274545, + "grad_norm": 2.2509055137634277, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7012937664985657, + "num_tokens": 184441042.0, + "step": 7280 + }, + { + "epoch": 0.7995826927300681, + "grad_norm": 2.232758045196533, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7237465381622314, + "num_tokens": 184463071.0, + "step": 7281 + }, + { + "epoch": 0.7996925104326817, + "grad_norm": 2.088531732559204, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7315936088562012, + "num_tokens": 184489775.0, + "step": 7282 + }, + { + "epoch": 0.7998023281352954, + "grad_norm": 2.34116792678833, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7146453261375427, + "num_tokens": 184512854.0, + "step": 7283 + }, + { + "epoch": 0.799912145837909, + "grad_norm": 2.130774974822998, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7064810991287231, + "num_tokens": 184539506.0, + "step": 7284 + }, + { + "epoch": 0.8000219635405227, + "grad_norm": 1.9685899019241333, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.695196270942688, + "num_tokens": 184568925.0, + "step": 7285 + }, + { + "epoch": 0.8001317812431364, + "grad_norm": 2.231313467025757, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7043244242668152, + "num_tokens": 184593385.0, + "step": 7286 + }, + { + "epoch": 0.8002415989457501, + "grad_norm": 2.1675286293029785, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.7004513740539551, + "num_tokens": 184618410.0, + "step": 7287 + }, + { + "epoch": 0.8003514166483637, + "grad_norm": 
2.6207520961761475, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7107498645782471, + "num_tokens": 184636496.0, + "step": 7288 + }, + { + "epoch": 0.8004612343509774, + "grad_norm": 2.207655191421509, + "learning_rate": 1e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7339629530906677, + "num_tokens": 184660077.0, + "step": 7289 + }, + { + "epoch": 0.800571052053591, + "grad_norm": 2.2540531158447266, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.6969489455223083, + "num_tokens": 184683339.0, + "step": 7290 + }, + { + "epoch": 0.8006808697562047, + "grad_norm": 2.131593942642212, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7160072326660156, + "num_tokens": 184710519.0, + "step": 7291 + }, + { + "epoch": 0.8007906874588183, + "grad_norm": 1.901735782623291, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.6919891834259033, + "num_tokens": 184742774.0, + "step": 7292 + }, + { + "epoch": 0.8009005051614321, + "grad_norm": 2.2730395793914795, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.71653151512146, + "num_tokens": 184765724.0, + "step": 7293 + }, + { + "epoch": 0.8010103228640457, + "grad_norm": 1.961295485496521, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7051224112510681, + "num_tokens": 184798360.0, + "step": 7294 + }, + { + "epoch": 0.8011201405666594, + "grad_norm": 1.8790016174316406, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.6974302530288696, + "num_tokens": 184831667.0, + "step": 7295 + }, + { + "epoch": 0.801229958269273, + "grad_norm": 2.026627779006958, + "learning_rate": 1e-06, + "loss": 0.866, + "mean_token_accuracy": 0.739793062210083, + "num_tokens": 184859137.0, + "step": 7296 + }, + { + "epoch": 0.8013397759718867, + "grad_norm": 2.132866859436035, + "learning_rate": 1e-06, + "loss": 1.0416, + "mean_token_accuracy": 0.683169960975647, + "num_tokens": 184887388.0, + 
"step": 7297 + }, + { + "epoch": 0.8014495936745003, + "grad_norm": 2.406667947769165, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7229572534561157, + "num_tokens": 184908293.0, + "step": 7298 + }, + { + "epoch": 0.801559411377114, + "grad_norm": 2.1838347911834717, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7106886506080627, + "num_tokens": 184932015.0, + "step": 7299 + }, + { + "epoch": 0.8016692290797276, + "grad_norm": 1.9243155717849731, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6869277954101562, + "num_tokens": 184964087.0, + "step": 7300 + }, + { + "epoch": 0.8017790467823414, + "grad_norm": 2.1303372383117676, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7030821442604065, + "num_tokens": 184991292.0, + "step": 7301 + }, + { + "epoch": 0.801888864484955, + "grad_norm": 2.1740877628326416, + "learning_rate": 1e-06, + "loss": 1.0487, + "mean_token_accuracy": 0.6803881525993347, + "num_tokens": 185018496.0, + "step": 7302 + }, + { + "epoch": 0.8019986821875686, + "grad_norm": 2.3311548233032227, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7053210735321045, + "num_tokens": 185042309.0, + "step": 7303 + }, + { + "epoch": 0.8021084998901823, + "grad_norm": 2.2887580394744873, + "learning_rate": 1e-06, + "loss": 1.0574, + "mean_token_accuracy": 0.6818101406097412, + "num_tokens": 185065882.0, + "step": 7304 + }, + { + "epoch": 0.8022183175927959, + "grad_norm": 2.5841989517211914, + "learning_rate": 1e-06, + "loss": 0.8412, + "mean_token_accuracy": 0.7333563566207886, + "num_tokens": 185084338.0, + "step": 7305 + }, + { + "epoch": 0.8023281352954096, + "grad_norm": 2.2695088386535645, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.694307804107666, + "num_tokens": 185107958.0, + "step": 7306 + }, + { + "epoch": 0.8024379529980232, + "grad_norm": 2.3488473892211914, + "learning_rate": 1e-06, + "loss": 0.8568, 
+ "mean_token_accuracy": 0.7310682535171509, + "num_tokens": 185127694.0, + "step": 7307 + }, + { + "epoch": 0.802547770700637, + "grad_norm": 2.1585943698883057, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.71070796251297, + "num_tokens": 185151963.0, + "step": 7308 + }, + { + "epoch": 0.8026575884032506, + "grad_norm": 2.5861706733703613, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7125452160835266, + "num_tokens": 185171807.0, + "step": 7309 + }, + { + "epoch": 0.8027674061058643, + "grad_norm": 2.4069619178771973, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7224881052970886, + "num_tokens": 185191604.0, + "step": 7310 + }, + { + "epoch": 0.8028772238084779, + "grad_norm": 2.291067123413086, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7177768349647522, + "num_tokens": 185214873.0, + "step": 7311 + }, + { + "epoch": 0.8029870415110916, + "grad_norm": 2.053190231323242, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7159984707832336, + "num_tokens": 185242536.0, + "step": 7312 + }, + { + "epoch": 0.8030968592137052, + "grad_norm": 2.248122215270996, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7121667861938477, + "num_tokens": 185265467.0, + "step": 7313 + }, + { + "epoch": 0.8032066769163189, + "grad_norm": 2.3800106048583984, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7143478989601135, + "num_tokens": 185286743.0, + "step": 7314 + }, + { + "epoch": 0.8033164946189326, + "grad_norm": 2.081686496734619, + "learning_rate": 1e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.7345903515815735, + "num_tokens": 185312420.0, + "step": 7315 + }, + { + "epoch": 0.8034263123215463, + "grad_norm": 2.5190300941467285, + "learning_rate": 1e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7163823246955872, + "num_tokens": 185332163.0, + "step": 7316 + }, + { + "epoch": 0.8035361300241599, + 
"grad_norm": 2.2340078353881836, + "learning_rate": 1e-06, + "loss": 1.0258, + "mean_token_accuracy": 0.6896949410438538, + "num_tokens": 185357542.0, + "step": 7317 + }, + { + "epoch": 0.8036459477267736, + "grad_norm": 2.1972784996032715, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7196537256240845, + "num_tokens": 185381405.0, + "step": 7318 + }, + { + "epoch": 0.8037557654293872, + "grad_norm": 2.0770864486694336, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7160927057266235, + "num_tokens": 185407771.0, + "step": 7319 + }, + { + "epoch": 0.8038655831320008, + "grad_norm": 2.4142379760742188, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7038438320159912, + "num_tokens": 185429371.0, + "step": 7320 + }, + { + "epoch": 0.8039754008346145, + "grad_norm": 2.392861843109131, + "learning_rate": 1e-06, + "loss": 0.8258, + "mean_token_accuracy": 0.7335940599441528, + "num_tokens": 185450141.0, + "step": 7321 + }, + { + "epoch": 0.8040852185372283, + "grad_norm": 1.9996336698532104, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.7035271525382996, + "num_tokens": 185479802.0, + "step": 7322 + }, + { + "epoch": 0.8041950362398419, + "grad_norm": 2.1829800605773926, + "learning_rate": 1e-06, + "loss": 0.8667, + "mean_token_accuracy": 0.7306562662124634, + "num_tokens": 185503978.0, + "step": 7323 + }, + { + "epoch": 0.8043048539424555, + "grad_norm": 2.191446542739868, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.6994847059249878, + "num_tokens": 185529647.0, + "step": 7324 + }, + { + "epoch": 0.8044146716450692, + "grad_norm": 1.86335289478302, + "learning_rate": 1e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.6936914920806885, + "num_tokens": 185564403.0, + "step": 7325 + }, + { + "epoch": 0.8045244893476828, + "grad_norm": 2.0452537536621094, + "learning_rate": 1e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.7236923575401306, + 
"num_tokens": 185592463.0, + "step": 7326 + }, + { + "epoch": 0.8046343070502965, + "grad_norm": 2.018596649169922, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7062774896621704, + "num_tokens": 185620088.0, + "step": 7327 + }, + { + "epoch": 0.8047441247529101, + "grad_norm": 2.078193426132202, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.7046185731887817, + "num_tokens": 185646531.0, + "step": 7328 + }, + { + "epoch": 0.8048539424555239, + "grad_norm": 2.1340394020080566, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.6999948024749756, + "num_tokens": 185673211.0, + "step": 7329 + }, + { + "epoch": 0.8049637601581375, + "grad_norm": 2.485201597213745, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7212258577346802, + "num_tokens": 185693291.0, + "step": 7330 + }, + { + "epoch": 0.8050735778607512, + "grad_norm": 2.1352760791778564, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7066777944564819, + "num_tokens": 185720509.0, + "step": 7331 + }, + { + "epoch": 0.8051833955633648, + "grad_norm": 2.204517364501953, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7127636671066284, + "num_tokens": 185746318.0, + "step": 7332 + }, + { + "epoch": 0.8052932132659785, + "grad_norm": 1.9225279092788696, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7055400609970093, + "num_tokens": 185777413.0, + "step": 7333 + }, + { + "epoch": 0.8054030309685921, + "grad_norm": 2.163344383239746, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.7027156352996826, + "num_tokens": 185805770.0, + "step": 7334 + }, + { + "epoch": 0.8055128486712058, + "grad_norm": 1.9927783012390137, + "learning_rate": 1e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6829229593276978, + "num_tokens": 185836925.0, + "step": 7335 + }, + { + "epoch": 0.8056226663738194, + "grad_norm": 1.9796726703643799, + 
"learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7009776830673218, + "num_tokens": 185864799.0, + "step": 7336 + }, + { + "epoch": 0.8057324840764332, + "grad_norm": 2.5131924152374268, + "learning_rate": 1e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7356294989585876, + "num_tokens": 185883112.0, + "step": 7337 + }, + { + "epoch": 0.8058423017790468, + "grad_norm": 1.8863357305526733, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7004550695419312, + "num_tokens": 185913622.0, + "step": 7338 + }, + { + "epoch": 0.8059521194816605, + "grad_norm": 2.1788222789764404, + "learning_rate": 1e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7172307968139648, + "num_tokens": 185937763.0, + "step": 7339 + }, + { + "epoch": 0.8060619371842741, + "grad_norm": 2.2352871894836426, + "learning_rate": 1e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.7098941802978516, + "num_tokens": 185961177.0, + "step": 7340 + }, + { + "epoch": 0.8061717548868877, + "grad_norm": 2.2282683849334717, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7070605158805847, + "num_tokens": 185986379.0, + "step": 7341 + }, + { + "epoch": 0.8062815725895014, + "grad_norm": 2.191359043121338, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7137254476547241, + "num_tokens": 186011565.0, + "step": 7342 + }, + { + "epoch": 0.806391390292115, + "grad_norm": 2.1954076290130615, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7074209451675415, + "num_tokens": 186035494.0, + "step": 7343 + }, + { + "epoch": 0.8065012079947288, + "grad_norm": 2.2809677124023438, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7173580527305603, + "num_tokens": 186058244.0, + "step": 7344 + }, + { + "epoch": 0.8066110256973424, + "grad_norm": 2.23437237739563, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.6857013702392578, + "num_tokens": 186084780.0, + "step": 7345 + 
}, + { + "epoch": 0.8067208433999561, + "grad_norm": 2.1285204887390137, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7179172039031982, + "num_tokens": 186110704.0, + "step": 7346 + }, + { + "epoch": 0.8068306611025697, + "grad_norm": 2.1855225563049316, + "learning_rate": 1e-06, + "loss": 1.0484, + "mean_token_accuracy": 0.6875545382499695, + "num_tokens": 186136450.0, + "step": 7347 + }, + { + "epoch": 0.8069404788051834, + "grad_norm": 1.874559998512268, + "learning_rate": 1e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.6889562010765076, + "num_tokens": 186170282.0, + "step": 7348 + }, + { + "epoch": 0.807050296507797, + "grad_norm": 2.2642083168029785, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.7051990032196045, + "num_tokens": 186194037.0, + "step": 7349 + }, + { + "epoch": 0.8071601142104107, + "grad_norm": 1.9840588569641113, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.7033522725105286, + "num_tokens": 186221689.0, + "step": 7350 + }, + { + "epoch": 0.8072699319130244, + "grad_norm": 2.2774574756622314, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6872159838676453, + "num_tokens": 186247045.0, + "step": 7351 + }, + { + "epoch": 0.8073797496156381, + "grad_norm": 2.2996082305908203, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.704163670539856, + "num_tokens": 186270594.0, + "step": 7352 + }, + { + "epoch": 0.8074895673182517, + "grad_norm": 2.126765012741089, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.718869686126709, + "num_tokens": 186296124.0, + "step": 7353 + }, + { + "epoch": 0.8075993850208654, + "grad_norm": 2.199634075164795, + "learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.6832975745201111, + "num_tokens": 186321321.0, + "step": 7354 + }, + { + "epoch": 0.807709202723479, + "grad_norm": 2.2481563091278076, + "learning_rate": 1e-06, + "loss": 0.9328, + 
"mean_token_accuracy": 0.719277560710907, + "num_tokens": 186344359.0, + "step": 7355 + }, + { + "epoch": 0.8078190204260927, + "grad_norm": 2.0743331909179688, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.6915630102157593, + "num_tokens": 186372502.0, + "step": 7356 + }, + { + "epoch": 0.8079288381287063, + "grad_norm": 2.101390838623047, + "learning_rate": 1e-06, + "loss": 0.8604, + "mean_token_accuracy": 0.732338547706604, + "num_tokens": 186397155.0, + "step": 7357 + }, + { + "epoch": 0.8080386558313201, + "grad_norm": 2.2201852798461914, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7058870792388916, + "num_tokens": 186420229.0, + "step": 7358 + }, + { + "epoch": 0.8081484735339337, + "grad_norm": 2.4194176197052, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7146300077438354, + "num_tokens": 186441844.0, + "step": 7359 + }, + { + "epoch": 0.8082582912365474, + "grad_norm": 2.208885908126831, + "learning_rate": 1e-06, + "loss": 0.8664, + "mean_token_accuracy": 0.7295805811882019, + "num_tokens": 186464683.0, + "step": 7360 + }, + { + "epoch": 0.808368108939161, + "grad_norm": 1.9835699796676636, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7083983421325684, + "num_tokens": 186495655.0, + "step": 7361 + }, + { + "epoch": 0.8084779266417746, + "grad_norm": 2.2966151237487793, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7135127186775208, + "num_tokens": 186517320.0, + "step": 7362 + }, + { + "epoch": 0.8085877443443883, + "grad_norm": 2.2068111896514893, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.7073948383331299, + "num_tokens": 186542370.0, + "step": 7363 + }, + { + "epoch": 0.8086975620470019, + "grad_norm": 2.290052890777588, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7059684991836548, + "num_tokens": 186566134.0, + "step": 7364 + }, + { + "epoch": 0.8088073797496156, + 
"grad_norm": 2.2215144634246826, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7206302881240845, + "num_tokens": 186589924.0, + "step": 7365 + }, + { + "epoch": 0.8089171974522293, + "grad_norm": 2.0131266117095947, + "learning_rate": 1e-06, + "loss": 1.0199, + "mean_token_accuracy": 0.6907839775085449, + "num_tokens": 186621222.0, + "step": 7366 + }, + { + "epoch": 0.809027015154843, + "grad_norm": 2.363713502883911, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7226773500442505, + "num_tokens": 186643054.0, + "step": 7367 + }, + { + "epoch": 0.8091368328574566, + "grad_norm": 2.4889795780181885, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7081238627433777, + "num_tokens": 186665062.0, + "step": 7368 + }, + { + "epoch": 0.8092466505600703, + "grad_norm": 1.9967217445373535, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.7013620138168335, + "num_tokens": 186695215.0, + "step": 7369 + }, + { + "epoch": 0.8093564682626839, + "grad_norm": 2.3432669639587402, + "learning_rate": 1e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7115412950515747, + "num_tokens": 186720727.0, + "step": 7370 + }, + { + "epoch": 0.8094662859652976, + "grad_norm": 2.1592297554016113, + "learning_rate": 1e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.697176456451416, + "num_tokens": 186747896.0, + "step": 7371 + }, + { + "epoch": 0.8095761036679112, + "grad_norm": 2.2147486209869385, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7233229875564575, + "num_tokens": 186773225.0, + "step": 7372 + }, + { + "epoch": 0.809685921370525, + "grad_norm": 2.1703500747680664, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.6939593553543091, + "num_tokens": 186800284.0, + "step": 7373 + }, + { + "epoch": 0.8097957390731386, + "grad_norm": 2.3943679332733154, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7084142565727234, + 
"num_tokens": 186822468.0, + "step": 7374 + }, + { + "epoch": 0.8099055567757523, + "grad_norm": 2.0552706718444824, + "learning_rate": 1e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6967395544052124, + "num_tokens": 186851148.0, + "step": 7375 + }, + { + "epoch": 0.8100153744783659, + "grad_norm": 2.1768126487731934, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7119061946868896, + "num_tokens": 186876215.0, + "step": 7376 + }, + { + "epoch": 0.8101251921809796, + "grad_norm": 2.074300527572632, + "learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.7088993787765503, + "num_tokens": 186902824.0, + "step": 7377 + }, + { + "epoch": 0.8102350098835932, + "grad_norm": 2.237598419189453, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7084651589393616, + "num_tokens": 186926510.0, + "step": 7378 + }, + { + "epoch": 0.8103448275862069, + "grad_norm": 2.2452290058135986, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.6923994421958923, + "num_tokens": 186951194.0, + "step": 7379 + }, + { + "epoch": 0.8104546452888206, + "grad_norm": 2.265324592590332, + "learning_rate": 1e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.6967403888702393, + "num_tokens": 186974794.0, + "step": 7380 + }, + { + "epoch": 0.8105644629914343, + "grad_norm": 2.0999488830566406, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.6925450563430786, + "num_tokens": 187000883.0, + "step": 7381 + }, + { + "epoch": 0.8106742806940479, + "grad_norm": 2.2109487056732178, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.696079432964325, + "num_tokens": 187025198.0, + "step": 7382 + }, + { + "epoch": 0.8107840983966615, + "grad_norm": 2.498208522796631, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7188907861709595, + "num_tokens": 187044945.0, + "step": 7383 + }, + { + "epoch": 0.8108939160992752, + "grad_norm": 2.1331801414489746, + 
"learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7135237455368042, + "num_tokens": 187071757.0, + "step": 7384 + }, + { + "epoch": 0.8110037338018888, + "grad_norm": 2.2336161136627197, + "learning_rate": 1e-06, + "loss": 0.8445, + "mean_token_accuracy": 0.7444769144058228, + "num_tokens": 187095930.0, + "step": 7385 + }, + { + "epoch": 0.8111135515045025, + "grad_norm": 2.194629669189453, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.714895486831665, + "num_tokens": 187120236.0, + "step": 7386 + }, + { + "epoch": 0.8112233692071162, + "grad_norm": 2.209803342819214, + "learning_rate": 1e-06, + "loss": 1.0542, + "mean_token_accuracy": 0.6897425055503845, + "num_tokens": 187146857.0, + "step": 7387 + }, + { + "epoch": 0.8113331869097299, + "grad_norm": 2.085522174835205, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.712920069694519, + "num_tokens": 187173258.0, + "step": 7388 + }, + { + "epoch": 0.8114430046123435, + "grad_norm": 2.1042957305908203, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7112253904342651, + "num_tokens": 187202740.0, + "step": 7389 + }, + { + "epoch": 0.8115528223149572, + "grad_norm": 2.298886299133301, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7004973888397217, + "num_tokens": 187227125.0, + "step": 7390 + }, + { + "epoch": 0.8116626400175708, + "grad_norm": 2.296447277069092, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7195984125137329, + "num_tokens": 187249309.0, + "step": 7391 + }, + { + "epoch": 0.8117724577201845, + "grad_norm": 2.077605724334717, + "learning_rate": 1e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.7069506645202637, + "num_tokens": 187276813.0, + "step": 7392 + }, + { + "epoch": 0.8118822754227981, + "grad_norm": 2.2233686447143555, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.714459240436554, + "num_tokens": 187300757.0, + "step": 7393 + }, 
+ { + "epoch": 0.8119920931254118, + "grad_norm": 2.5314853191375732, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7186509370803833, + "num_tokens": 187319878.0, + "step": 7394 + }, + { + "epoch": 0.8121019108280255, + "grad_norm": 2.0783908367156982, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7085370421409607, + "num_tokens": 187347179.0, + "step": 7395 + }, + { + "epoch": 0.8122117285306392, + "grad_norm": 2.255706548690796, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7108641862869263, + "num_tokens": 187369705.0, + "step": 7396 + }, + { + "epoch": 0.8123215462332528, + "grad_norm": 2.2274892330169678, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.715180516242981, + "num_tokens": 187393221.0, + "step": 7397 + }, + { + "epoch": 0.8124313639358665, + "grad_norm": 2.0579893589019775, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7072293758392334, + "num_tokens": 187420639.0, + "step": 7398 + }, + { + "epoch": 0.8125411816384801, + "grad_norm": 2.445507526397705, + "learning_rate": 1e-06, + "loss": 0.8556, + "mean_token_accuracy": 0.730506181716919, + "num_tokens": 187441167.0, + "step": 7399 + }, + { + "epoch": 0.8126509993410937, + "grad_norm": 2.477529287338257, + "learning_rate": 1e-06, + "loss": 1.0334, + "mean_token_accuracy": 0.6872724890708923, + "num_tokens": 187462383.0, + "step": 7400 + }, + { + "epoch": 0.8127608170437074, + "grad_norm": 2.440798044204712, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7264153957366943, + "num_tokens": 187482406.0, + "step": 7401 + }, + { + "epoch": 0.8128706347463212, + "grad_norm": 2.217252731323242, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7201835513114929, + "num_tokens": 187507608.0, + "step": 7402 + }, + { + "epoch": 0.8129804524489348, + "grad_norm": 2.0849714279174805, + "learning_rate": 1e-06, + "loss": 0.9146, + 
"mean_token_accuracy": 0.7173968553543091, + "num_tokens": 187534806.0, + "step": 7403 + }, + { + "epoch": 0.8130902701515484, + "grad_norm": 2.2599587440490723, + "learning_rate": 1e-06, + "loss": 1.0931, + "mean_token_accuracy": 0.6843116283416748, + "num_tokens": 187559731.0, + "step": 7404 + }, + { + "epoch": 0.8132000878541621, + "grad_norm": 1.964832067489624, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7100121974945068, + "num_tokens": 187590127.0, + "step": 7405 + }, + { + "epoch": 0.8133099055567757, + "grad_norm": 2.035752296447754, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7090725898742676, + "num_tokens": 187617266.0, + "step": 7406 + }, + { + "epoch": 0.8134197232593894, + "grad_norm": 1.9955036640167236, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7035322189331055, + "num_tokens": 187649293.0, + "step": 7407 + }, + { + "epoch": 0.813529540962003, + "grad_norm": 2.2825489044189453, + "learning_rate": 1e-06, + "loss": 1.0517, + "mean_token_accuracy": 0.6840019226074219, + "num_tokens": 187674451.0, + "step": 7408 + }, + { + "epoch": 0.8136393586646168, + "grad_norm": 2.2001161575317383, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7058374881744385, + "num_tokens": 187699850.0, + "step": 7409 + }, + { + "epoch": 0.8137491763672304, + "grad_norm": 2.14801287651062, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7220932245254517, + "num_tokens": 187725783.0, + "step": 7410 + }, + { + "epoch": 0.8138589940698441, + "grad_norm": 2.489640951156616, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7117202281951904, + "num_tokens": 187746199.0, + "step": 7411 + }, + { + "epoch": 0.8139688117724577, + "grad_norm": 2.3915421962738037, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.737686276435852, + "num_tokens": 187768213.0, + "step": 7412 + }, + { + "epoch": 0.8140786294750714, + 
"grad_norm": 2.4145047664642334, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7080485224723816, + "num_tokens": 187791019.0, + "step": 7413 + }, + { + "epoch": 0.814188447177685, + "grad_norm": 2.153252363204956, + "learning_rate": 1e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7256404161453247, + "num_tokens": 187816469.0, + "step": 7414 + }, + { + "epoch": 0.8142982648802987, + "grad_norm": 2.402604818344116, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7045720815658569, + "num_tokens": 187838388.0, + "step": 7415 + }, + { + "epoch": 0.8144080825829124, + "grad_norm": 2.2407045364379883, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7144184112548828, + "num_tokens": 187861546.0, + "step": 7416 + }, + { + "epoch": 0.8145179002855261, + "grad_norm": 2.5291528701782227, + "learning_rate": 1e-06, + "loss": 1.01, + "mean_token_accuracy": 0.6983492374420166, + "num_tokens": 187883026.0, + "step": 7417 + }, + { + "epoch": 0.8146277179881397, + "grad_norm": 2.69877028465271, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7064790725708008, + "num_tokens": 187903401.0, + "step": 7418 + }, + { + "epoch": 0.8147375356907534, + "grad_norm": 2.156109094619751, + "learning_rate": 1e-06, + "loss": 0.8585, + "mean_token_accuracy": 0.7419854402542114, + "num_tokens": 187926856.0, + "step": 7419 + }, + { + "epoch": 0.814847353393367, + "grad_norm": 1.9429484605789185, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.6951427459716797, + "num_tokens": 187958879.0, + "step": 7420 + }, + { + "epoch": 0.8149571710959806, + "grad_norm": 2.3632378578186035, + "learning_rate": 1e-06, + "loss": 0.8495, + "mean_token_accuracy": 0.7359155416488647, + "num_tokens": 187980643.0, + "step": 7421 + }, + { + "epoch": 0.8150669887985943, + "grad_norm": 2.0960288047790527, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.6909513473510742, + 
"num_tokens": 188008652.0, + "step": 7422 + }, + { + "epoch": 0.8151768065012079, + "grad_norm": 2.037247896194458, + "learning_rate": 1e-06, + "loss": 1.0595, + "mean_token_accuracy": 0.6786471605300903, + "num_tokens": 188038569.0, + "step": 7423 + }, + { + "epoch": 0.8152866242038217, + "grad_norm": 2.6576719284057617, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7219740152359009, + "num_tokens": 188056461.0, + "step": 7424 + }, + { + "epoch": 0.8153964419064353, + "grad_norm": 1.9131029844284058, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.6882889866828918, + "num_tokens": 188087755.0, + "step": 7425 + }, + { + "epoch": 0.815506259609049, + "grad_norm": 2.5325803756713867, + "learning_rate": 1e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7319672703742981, + "num_tokens": 188107112.0, + "step": 7426 + }, + { + "epoch": 0.8156160773116626, + "grad_norm": 2.2583858966827393, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7229128479957581, + "num_tokens": 188131989.0, + "step": 7427 + }, + { + "epoch": 0.8157258950142763, + "grad_norm": 2.012831449508667, + "learning_rate": 1e-06, + "loss": 1.0168, + "mean_token_accuracy": 0.697102427482605, + "num_tokens": 188164195.0, + "step": 7428 + }, + { + "epoch": 0.8158357127168899, + "grad_norm": 2.581521511077881, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7166839838027954, + "num_tokens": 188183001.0, + "step": 7429 + }, + { + "epoch": 0.8159455304195036, + "grad_norm": 2.1921355724334717, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.6931834816932678, + "num_tokens": 188207943.0, + "step": 7430 + }, + { + "epoch": 0.8160553481221173, + "grad_norm": 2.121765375137329, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7119123339653015, + "num_tokens": 188234737.0, + "step": 7431 + }, + { + "epoch": 0.816165165824731, + "grad_norm": 2.1522231101989746, + "learning_rate": 
1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.710952639579773, + "num_tokens": 188260063.0, + "step": 7432 + }, + { + "epoch": 0.8162749835273446, + "grad_norm": 2.3998146057128906, + "learning_rate": 1e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7129718065261841, + "num_tokens": 188282090.0, + "step": 7433 + }, + { + "epoch": 0.8163848012299583, + "grad_norm": 2.086317539215088, + "learning_rate": 1e-06, + "loss": 1.0543, + "mean_token_accuracy": 0.6845391988754272, + "num_tokens": 188309722.0, + "step": 7434 + }, + { + "epoch": 0.8164946189325719, + "grad_norm": 2.026341199874878, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7123627662658691, + "num_tokens": 188340090.0, + "step": 7435 + }, + { + "epoch": 0.8166044366351856, + "grad_norm": 1.9452462196350098, + "learning_rate": 1e-06, + "loss": 1.0319, + "mean_token_accuracy": 0.6957197189331055, + "num_tokens": 188372474.0, + "step": 7436 + }, + { + "epoch": 0.8167142543377992, + "grad_norm": 2.1159448623657227, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.706870436668396, + "num_tokens": 188396729.0, + "step": 7437 + }, + { + "epoch": 0.816824072040413, + "grad_norm": 2.017747163772583, + "learning_rate": 1e-06, + "loss": 1.0157, + "mean_token_accuracy": 0.6928375959396362, + "num_tokens": 188426675.0, + "step": 7438 + }, + { + "epoch": 0.8169338897430266, + "grad_norm": 2.3815486431121826, + "learning_rate": 1e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.723867654800415, + "num_tokens": 188447405.0, + "step": 7439 + }, + { + "epoch": 0.8170437074456403, + "grad_norm": 2.2137246131896973, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7107031345367432, + "num_tokens": 188473303.0, + "step": 7440 + }, + { + "epoch": 0.8171535251482539, + "grad_norm": 1.9352917671203613, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.6969825625419617, + "num_tokens": 188504901.0, + "step": 7441 + }, + { + "epoch": 
0.8172633428508675, + "grad_norm": 2.1347594261169434, + "learning_rate": 1e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7266260981559753, + "num_tokens": 188530808.0, + "step": 7442 + }, + { + "epoch": 0.8173731605534812, + "grad_norm": 2.36348819732666, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7064979076385498, + "num_tokens": 188552345.0, + "step": 7443 + }, + { + "epoch": 0.8174829782560948, + "grad_norm": 2.5187647342681885, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7129697799682617, + "num_tokens": 188573291.0, + "step": 7444 + }, + { + "epoch": 0.8175927959587086, + "grad_norm": 2.051943778991699, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.6973501443862915, + "num_tokens": 188602441.0, + "step": 7445 + }, + { + "epoch": 0.8177026136613222, + "grad_norm": 2.2645084857940674, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7087358236312866, + "num_tokens": 188626558.0, + "step": 7446 + }, + { + "epoch": 0.8178124313639359, + "grad_norm": 2.172962188720703, + "learning_rate": 1e-06, + "loss": 0.8617, + "mean_token_accuracy": 0.7326356172561646, + "num_tokens": 188652735.0, + "step": 7447 + }, + { + "epoch": 0.8179222490665495, + "grad_norm": 2.2324817180633545, + "learning_rate": 1e-06, + "loss": 1.0322, + "mean_token_accuracy": 0.68168044090271, + "num_tokens": 188677784.0, + "step": 7448 + }, + { + "epoch": 0.8180320667691632, + "grad_norm": 2.0879557132720947, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7065563797950745, + "num_tokens": 188707172.0, + "step": 7449 + }, + { + "epoch": 0.8181418844717768, + "grad_norm": 2.3924708366394043, + "learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.6922304630279541, + "num_tokens": 188731031.0, + "step": 7450 + }, + { + "epoch": 0.8182517021743905, + "grad_norm": 2.1994822025299072, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 
0.7265154123306274, + "num_tokens": 188755529.0, + "step": 7451 + }, + { + "epoch": 0.8183615198770041, + "grad_norm": 1.905684471130371, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.6961706280708313, + "num_tokens": 188786919.0, + "step": 7452 + }, + { + "epoch": 0.8184713375796179, + "grad_norm": 2.2287344932556152, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.6977101564407349, + "num_tokens": 188813562.0, + "step": 7453 + }, + { + "epoch": 0.8185811552822315, + "grad_norm": 2.2250113487243652, + "learning_rate": 1e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.7354511618614197, + "num_tokens": 188837149.0, + "step": 7454 + }, + { + "epoch": 0.8186909729848452, + "grad_norm": 2.3330349922180176, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.6997615098953247, + "num_tokens": 188859196.0, + "step": 7455 + }, + { + "epoch": 0.8188007906874588, + "grad_norm": 2.0303597450256348, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7070066332817078, + "num_tokens": 188886882.0, + "step": 7456 + }, + { + "epoch": 0.8189106083900725, + "grad_norm": 2.0444672107696533, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.687549889087677, + "num_tokens": 188914896.0, + "step": 7457 + }, + { + "epoch": 0.8190204260926861, + "grad_norm": 2.211869716644287, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7092639207839966, + "num_tokens": 188941721.0, + "step": 7458 + }, + { + "epoch": 0.8191302437952998, + "grad_norm": 2.04913592338562, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7128007411956787, + "num_tokens": 188969925.0, + "step": 7459 + }, + { + "epoch": 0.8192400614979135, + "grad_norm": 2.303190231323242, + "learning_rate": 1e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.7288310527801514, + "num_tokens": 188990693.0, + "step": 7460 + }, + { + "epoch": 0.8193498792005272, + "grad_norm": 
2.1796371936798096, + "learning_rate": 1e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.692867636680603, + "num_tokens": 189015286.0, + "step": 7461 + }, + { + "epoch": 0.8194596969031408, + "grad_norm": 2.2771952152252197, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.6875495314598083, + "num_tokens": 189040486.0, + "step": 7462 + }, + { + "epoch": 0.8195695146057544, + "grad_norm": 1.989027976989746, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.6994048357009888, + "num_tokens": 189069533.0, + "step": 7463 + }, + { + "epoch": 0.8196793323083681, + "grad_norm": 2.2324438095092773, + "learning_rate": 1e-06, + "loss": 1.0714, + "mean_token_accuracy": 0.6738690137863159, + "num_tokens": 189095674.0, + "step": 7464 + }, + { + "epoch": 0.8197891500109817, + "grad_norm": 2.1774463653564453, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7122802138328552, + "num_tokens": 189118461.0, + "step": 7465 + }, + { + "epoch": 0.8198989677135954, + "grad_norm": 2.2173709869384766, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7062736749649048, + "num_tokens": 189141705.0, + "step": 7466 + }, + { + "epoch": 0.8200087854162091, + "grad_norm": 2.3803932666778564, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7096778750419617, + "num_tokens": 189164022.0, + "step": 7467 + }, + { + "epoch": 0.8201186031188228, + "grad_norm": 2.21427059173584, + "learning_rate": 1e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.7169609665870667, + "num_tokens": 189188161.0, + "step": 7468 + }, + { + "epoch": 0.8202284208214364, + "grad_norm": 2.051880359649658, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.6944913864135742, + "num_tokens": 189217343.0, + "step": 7469 + }, + { + "epoch": 0.8203382385240501, + "grad_norm": 2.071566581726074, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.6960282325744629, + "num_tokens": 
189245830.0, + "step": 7470 + }, + { + "epoch": 0.8204480562266637, + "grad_norm": 2.4060096740722656, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.715954601764679, + "num_tokens": 189267996.0, + "step": 7471 + }, + { + "epoch": 0.8205578739292774, + "grad_norm": 2.1217825412750244, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7254940271377563, + "num_tokens": 189293941.0, + "step": 7472 + }, + { + "epoch": 0.820667691631891, + "grad_norm": 2.2708899974823, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.7192811965942383, + "num_tokens": 189317081.0, + "step": 7473 + }, + { + "epoch": 0.8207775093345048, + "grad_norm": 2.350562334060669, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.7067335247993469, + "num_tokens": 189339297.0, + "step": 7474 + }, + { + "epoch": 0.8208873270371184, + "grad_norm": 2.470792293548584, + "learning_rate": 1e-06, + "loss": 0.8407, + "mean_token_accuracy": 0.7338757514953613, + "num_tokens": 189360121.0, + "step": 7475 + }, + { + "epoch": 0.8209971447397321, + "grad_norm": 2.1462008953094482, + "learning_rate": 1e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7298957109451294, + "num_tokens": 189385105.0, + "step": 7476 + }, + { + "epoch": 0.8211069624423457, + "grad_norm": 2.2556514739990234, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.7033640146255493, + "num_tokens": 189409962.0, + "step": 7477 + }, + { + "epoch": 0.8212167801449594, + "grad_norm": 2.346679449081421, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.70162034034729, + "num_tokens": 189433092.0, + "step": 7478 + }, + { + "epoch": 0.821326597847573, + "grad_norm": 2.0953519344329834, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7013354897499084, + "num_tokens": 189461076.0, + "step": 7479 + }, + { + "epoch": 0.8214364155501866, + "grad_norm": 2.0623106956481934, + "learning_rate": 1e-06, + "loss": 
1.0422, + "mean_token_accuracy": 0.6837964057922363, + "num_tokens": 189490795.0, + "step": 7480 + }, + { + "epoch": 0.8215462332528004, + "grad_norm": 1.9693975448608398, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.6990842819213867, + "num_tokens": 189520157.0, + "step": 7481 + }, + { + "epoch": 0.821656050955414, + "grad_norm": 2.078141450881958, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.710058331489563, + "num_tokens": 189547567.0, + "step": 7482 + }, + { + "epoch": 0.8217658686580277, + "grad_norm": 2.119971752166748, + "learning_rate": 1e-06, + "loss": 1.0531, + "mean_token_accuracy": 0.6816635131835938, + "num_tokens": 189577177.0, + "step": 7483 + }, + { + "epoch": 0.8218756863606413, + "grad_norm": 2.147737979888916, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.704485297203064, + "num_tokens": 189603271.0, + "step": 7484 + }, + { + "epoch": 0.821985504063255, + "grad_norm": 2.1567275524139404, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7077123522758484, + "num_tokens": 189629652.0, + "step": 7485 + }, + { + "epoch": 0.8220953217658686, + "grad_norm": 2.116302013397217, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.6947823762893677, + "num_tokens": 189655268.0, + "step": 7486 + }, + { + "epoch": 0.8222051394684823, + "grad_norm": 2.507478713989258, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7193660140037537, + "num_tokens": 189675726.0, + "step": 7487 + }, + { + "epoch": 0.8223149571710959, + "grad_norm": 2.322021961212158, + "learning_rate": 1e-06, + "loss": 0.8703, + "mean_token_accuracy": 0.7341382503509521, + "num_tokens": 189698925.0, + "step": 7488 + }, + { + "epoch": 0.8224247748737097, + "grad_norm": 2.284731149673462, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.700567901134491, + "num_tokens": 189723959.0, + "step": 7489 + }, + { + "epoch": 0.8225345925763233, + 
"grad_norm": 1.9268475770950317, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.6938878893852234, + "num_tokens": 189755342.0, + "step": 7490 + }, + { + "epoch": 0.822644410278937, + "grad_norm": 2.041048526763916, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7230608463287354, + "num_tokens": 189783788.0, + "step": 7491 + }, + { + "epoch": 0.8227542279815506, + "grad_norm": 2.254528045654297, + "learning_rate": 1e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.7008959054946899, + "num_tokens": 189807310.0, + "step": 7492 + }, + { + "epoch": 0.8228640456841643, + "grad_norm": 2.0756242275238037, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.6961146593093872, + "num_tokens": 189836710.0, + "step": 7493 + }, + { + "epoch": 0.8229738633867779, + "grad_norm": 2.227231740951538, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7051125168800354, + "num_tokens": 189859933.0, + "step": 7494 + }, + { + "epoch": 0.8230836810893916, + "grad_norm": 2.2814066410064697, + "learning_rate": 1e-06, + "loss": 1.051, + "mean_token_accuracy": 0.6881593465805054, + "num_tokens": 189885383.0, + "step": 7495 + }, + { + "epoch": 0.8231934987920053, + "grad_norm": 2.1296801567077637, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.701347827911377, + "num_tokens": 189910851.0, + "step": 7496 + }, + { + "epoch": 0.823303316494619, + "grad_norm": 2.292677164077759, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7081058621406555, + "num_tokens": 189932710.0, + "step": 7497 + }, + { + "epoch": 0.8234131341972326, + "grad_norm": 2.346304178237915, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7288112044334412, + "num_tokens": 189955939.0, + "step": 7498 + }, + { + "epoch": 0.8235229518998463, + "grad_norm": 2.2512261867523193, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7241278886795044, + 
"num_tokens": 189977012.0, + "step": 7499 + }, + { + "epoch": 0.8236327696024599, + "grad_norm": 2.0824899673461914, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.709526777267456, + "num_tokens": 190003833.0, + "step": 7500 + }, + { + "epoch": 0.8237425873050735, + "grad_norm": 2.278834342956543, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.723082959651947, + "num_tokens": 190025981.0, + "step": 7501 + }, + { + "epoch": 0.8238524050076872, + "grad_norm": 2.449636220932007, + "learning_rate": 1e-06, + "loss": 0.8218, + "mean_token_accuracy": 0.7427937984466553, + "num_tokens": 190045738.0, + "step": 7502 + }, + { + "epoch": 0.823962222710301, + "grad_norm": 2.1665990352630615, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.705168604850769, + "num_tokens": 190070652.0, + "step": 7503 + }, + { + "epoch": 0.8240720404129146, + "grad_norm": 2.4879140853881836, + "learning_rate": 1e-06, + "loss": 0.8202, + "mean_token_accuracy": 0.7425706386566162, + "num_tokens": 190089686.0, + "step": 7504 + }, + { + "epoch": 0.8241818581155282, + "grad_norm": 2.2717506885528564, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7029984593391418, + "num_tokens": 190112442.0, + "step": 7505 + }, + { + "epoch": 0.8242916758181419, + "grad_norm": 2.1450045108795166, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7271453142166138, + "num_tokens": 190136433.0, + "step": 7506 + }, + { + "epoch": 0.8244014935207555, + "grad_norm": 2.4669559001922607, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.719753623008728, + "num_tokens": 190157325.0, + "step": 7507 + }, + { + "epoch": 0.8245113112233692, + "grad_norm": 2.077026844024658, + "learning_rate": 1e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7320125102996826, + "num_tokens": 190184356.0, + "step": 7508 + }, + { + "epoch": 0.8246211289259828, + "grad_norm": 2.2846436500549316, + "learning_rate": 
1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.709397554397583, + "num_tokens": 190209632.0, + "step": 7509 + }, + { + "epoch": 0.8247309466285966, + "grad_norm": 2.29539155960083, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7103272080421448, + "num_tokens": 190233599.0, + "step": 7510 + }, + { + "epoch": 0.8248407643312102, + "grad_norm": 2.1000614166259766, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.69343501329422, + "num_tokens": 190261257.0, + "step": 7511 + }, + { + "epoch": 0.8249505820338239, + "grad_norm": 2.2473316192626953, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.6941351890563965, + "num_tokens": 190285852.0, + "step": 7512 + }, + { + "epoch": 0.8250603997364375, + "grad_norm": 2.5119760036468506, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7246617078781128, + "num_tokens": 190305378.0, + "step": 7513 + }, + { + "epoch": 0.8251702174390512, + "grad_norm": 2.1277332305908203, + "learning_rate": 1e-06, + "loss": 1.0616, + "mean_token_accuracy": 0.676596462726593, + "num_tokens": 190333883.0, + "step": 7514 + }, + { + "epoch": 0.8252800351416648, + "grad_norm": 2.4676482677459717, + "learning_rate": 1e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7137730121612549, + "num_tokens": 190354469.0, + "step": 7515 + }, + { + "epoch": 0.8253898528442785, + "grad_norm": 2.1740245819091797, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7100281715393066, + "num_tokens": 190383283.0, + "step": 7516 + }, + { + "epoch": 0.8254996705468921, + "grad_norm": 2.019303560256958, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7059119939804077, + "num_tokens": 190411014.0, + "step": 7517 + }, + { + "epoch": 0.8256094882495059, + "grad_norm": 2.173588514328003, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.6958005428314209, + "num_tokens": 190435962.0, + "step": 7518 + }, + { + "epoch": 
0.8257193059521195, + "grad_norm": 2.1265666484832764, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.6924507021903992, + "num_tokens": 190462887.0, + "step": 7519 + }, + { + "epoch": 0.8258291236547332, + "grad_norm": 2.2232892513275146, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6934738755226135, + "num_tokens": 190486823.0, + "step": 7520 + }, + { + "epoch": 0.8259389413573468, + "grad_norm": 2.1613681316375732, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7035561800003052, + "num_tokens": 190513225.0, + "step": 7521 + }, + { + "epoch": 0.8260487590599604, + "grad_norm": 2.2854204177856445, + "learning_rate": 1e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.693724513053894, + "num_tokens": 190537275.0, + "step": 7522 + }, + { + "epoch": 0.8261585767625741, + "grad_norm": 2.2527382373809814, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7115004658699036, + "num_tokens": 190560798.0, + "step": 7523 + }, + { + "epoch": 0.8262683944651877, + "grad_norm": 2.24938702583313, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7290191650390625, + "num_tokens": 190584104.0, + "step": 7524 + }, + { + "epoch": 0.8263782121678015, + "grad_norm": 2.3629136085510254, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.728114902973175, + "num_tokens": 190605457.0, + "step": 7525 + }, + { + "epoch": 0.8264880298704151, + "grad_norm": 2.1968698501586914, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7093343138694763, + "num_tokens": 190631204.0, + "step": 7526 + }, + { + "epoch": 0.8265978475730288, + "grad_norm": 2.1405413150787354, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7059832215309143, + "num_tokens": 190658409.0, + "step": 7527 + }, + { + "epoch": 0.8267076652756424, + "grad_norm": 2.394028425216675, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 
0.7324504852294922, + "num_tokens": 190682594.0, + "step": 7528 + }, + { + "epoch": 0.8268174829782561, + "grad_norm": 1.860351800918579, + "learning_rate": 1e-06, + "loss": 1.0582, + "mean_token_accuracy": 0.6807629466056824, + "num_tokens": 190718400.0, + "step": 7529 + }, + { + "epoch": 0.8269273006808697, + "grad_norm": 2.2833356857299805, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.6994436979293823, + "num_tokens": 190741670.0, + "step": 7530 + }, + { + "epoch": 0.8270371183834834, + "grad_norm": 1.9520297050476074, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7068113684654236, + "num_tokens": 190773080.0, + "step": 7531 + }, + { + "epoch": 0.8271469360860971, + "grad_norm": 2.0487797260284424, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7176318168640137, + "num_tokens": 190799692.0, + "step": 7532 + }, + { + "epoch": 0.8272567537887108, + "grad_norm": 2.1827950477600098, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.6948360204696655, + "num_tokens": 190826187.0, + "step": 7533 + }, + { + "epoch": 0.8273665714913244, + "grad_norm": 2.160926103591919, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.6980158090591431, + "num_tokens": 190851533.0, + "step": 7534 + }, + { + "epoch": 0.8274763891939381, + "grad_norm": 2.57084584236145, + "learning_rate": 1e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.7223591804504395, + "num_tokens": 190869980.0, + "step": 7535 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 2.1076595783233643, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7158894538879395, + "num_tokens": 190895181.0, + "step": 7536 + }, + { + "epoch": 0.8276960245991654, + "grad_norm": 2.6820180416107178, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7304860353469849, + "num_tokens": 190912620.0, + "step": 7537 + }, + { + "epoch": 0.827805842301779, + "grad_norm": 
2.027927875518799, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7145581245422363, + "num_tokens": 190943239.0, + "step": 7538 + }, + { + "epoch": 0.8279156600043928, + "grad_norm": 2.025918483734131, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7210286259651184, + "num_tokens": 190970566.0, + "step": 7539 + }, + { + "epoch": 0.8280254777070064, + "grad_norm": 2.103391170501709, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7029301524162292, + "num_tokens": 190996978.0, + "step": 7540 + }, + { + "epoch": 0.82813529540962, + "grad_norm": 2.549309730529785, + "learning_rate": 1e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.721306324005127, + "num_tokens": 191016095.0, + "step": 7541 + }, + { + "epoch": 0.8282451131122337, + "grad_norm": 2.0876073837280273, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7222321033477783, + "num_tokens": 191041077.0, + "step": 7542 + }, + { + "epoch": 0.8283549308148473, + "grad_norm": 1.8967812061309814, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7183303833007812, + "num_tokens": 191072516.0, + "step": 7543 + }, + { + "epoch": 0.828464748517461, + "grad_norm": 2.2542800903320312, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7100855112075806, + "num_tokens": 191097488.0, + "step": 7544 + }, + { + "epoch": 0.8285745662200746, + "grad_norm": 2.3486433029174805, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7143141031265259, + "num_tokens": 191119943.0, + "step": 7545 + }, + { + "epoch": 0.8286843839226883, + "grad_norm": 2.201625108718872, + "learning_rate": 1e-06, + "loss": 1.04, + "mean_token_accuracy": 0.6869508624076843, + "num_tokens": 191146085.0, + "step": 7546 + }, + { + "epoch": 0.828794201625302, + "grad_norm": 2.006399393081665, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7267568111419678, + "num_tokens": 191172913.0, + 
"step": 7547 + }, + { + "epoch": 0.8289040193279157, + "grad_norm": 2.273642063140869, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.6971302032470703, + "num_tokens": 191197933.0, + "step": 7548 + }, + { + "epoch": 0.8290138370305293, + "grad_norm": 2.0177876949310303, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.6987966299057007, + "num_tokens": 191228213.0, + "step": 7549 + }, + { + "epoch": 0.829123654733143, + "grad_norm": 2.357807159423828, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7006950974464417, + "num_tokens": 191250427.0, + "step": 7550 + }, + { + "epoch": 0.8292334724357566, + "grad_norm": 2.2758264541625977, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7151976227760315, + "num_tokens": 191274558.0, + "step": 7551 + }, + { + "epoch": 0.8293432901383703, + "grad_norm": 2.274024248123169, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7074469923973083, + "num_tokens": 191298778.0, + "step": 7552 + }, + { + "epoch": 0.8294531078409839, + "grad_norm": 2.0081729888916016, + "learning_rate": 1e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.7370388507843018, + "num_tokens": 191326691.0, + "step": 7553 + }, + { + "epoch": 0.8295629255435977, + "grad_norm": 2.039935350418091, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7196476459503174, + "num_tokens": 191356214.0, + "step": 7554 + }, + { + "epoch": 0.8296727432462113, + "grad_norm": 2.001059055328369, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.7009671926498413, + "num_tokens": 191386667.0, + "step": 7555 + }, + { + "epoch": 0.829782560948825, + "grad_norm": 2.3092856407165527, + "learning_rate": 1e-06, + "loss": 0.7996, + "mean_token_accuracy": 0.749104380607605, + "num_tokens": 191408304.0, + "step": 7556 + }, + { + "epoch": 0.8298923786514386, + "grad_norm": 1.9524391889572144, + "learning_rate": 1e-06, + "loss": 0.9935, + 
"mean_token_accuracy": 0.6958889365196228, + "num_tokens": 191441051.0, + "step": 7557 + }, + { + "epoch": 0.8300021963540523, + "grad_norm": 2.4935452938079834, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7265976667404175, + "num_tokens": 191460381.0, + "step": 7558 + }, + { + "epoch": 0.8301120140566659, + "grad_norm": 2.0894880294799805, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7007507085800171, + "num_tokens": 191488145.0, + "step": 7559 + }, + { + "epoch": 0.8302218317592795, + "grad_norm": 2.0719261169433594, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7102097868919373, + "num_tokens": 191518048.0, + "step": 7560 + }, + { + "epoch": 0.8303316494618933, + "grad_norm": 2.358670949935913, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7146433591842651, + "num_tokens": 191541154.0, + "step": 7561 + }, + { + "epoch": 0.830441467164507, + "grad_norm": 2.4037580490112305, + "learning_rate": 1e-06, + "loss": 1.0726, + "mean_token_accuracy": 0.6774011850357056, + "num_tokens": 191565020.0, + "step": 7562 + }, + { + "epoch": 0.8305512848671206, + "grad_norm": 2.130316734313965, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.6887292265892029, + "num_tokens": 191593445.0, + "step": 7563 + }, + { + "epoch": 0.8306611025697342, + "grad_norm": 2.2474453449249268, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7085405588150024, + "num_tokens": 191616294.0, + "step": 7564 + }, + { + "epoch": 0.8307709202723479, + "grad_norm": 2.1512234210968018, + "learning_rate": 1e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7190209627151489, + "num_tokens": 191641858.0, + "step": 7565 + }, + { + "epoch": 0.8308807379749615, + "grad_norm": 2.26418137550354, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7097734808921814, + "num_tokens": 191665629.0, + "step": 7566 + }, + { + "epoch": 0.8309905556775752, + 
"grad_norm": 1.9448935985565186, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.6890318393707275, + "num_tokens": 191698537.0, + "step": 7567 + }, + { + "epoch": 0.8311003733801889, + "grad_norm": 2.2531585693359375, + "learning_rate": 1e-06, + "loss": 0.8148, + "mean_token_accuracy": 0.7456562519073486, + "num_tokens": 191721469.0, + "step": 7568 + }, + { + "epoch": 0.8312101910828026, + "grad_norm": 2.136608362197876, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.688948929309845, + "num_tokens": 191748481.0, + "step": 7569 + }, + { + "epoch": 0.8313200087854162, + "grad_norm": 2.6734282970428467, + "learning_rate": 1e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.7225066423416138, + "num_tokens": 191765002.0, + "step": 7570 + }, + { + "epoch": 0.8314298264880299, + "grad_norm": 2.0733110904693604, + "learning_rate": 1e-06, + "loss": 1.048, + "mean_token_accuracy": 0.6893417835235596, + "num_tokens": 191794516.0, + "step": 7571 + }, + { + "epoch": 0.8315396441906435, + "grad_norm": 2.1591641902923584, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.6913289427757263, + "num_tokens": 191819607.0, + "step": 7572 + }, + { + "epoch": 0.8316494618932572, + "grad_norm": 2.3512120246887207, + "learning_rate": 1e-06, + "loss": 1.057, + "mean_token_accuracy": 0.6914756298065186, + "num_tokens": 191844170.0, + "step": 7573 + }, + { + "epoch": 0.8317592795958708, + "grad_norm": 2.2283778190612793, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.716759443283081, + "num_tokens": 191869009.0, + "step": 7574 + }, + { + "epoch": 0.8318690972984845, + "grad_norm": 2.1639041900634766, + "learning_rate": 1e-06, + "loss": 1.0261, + "mean_token_accuracy": 0.6842906475067139, + "num_tokens": 191896359.0, + "step": 7575 + }, + { + "epoch": 0.8319789150010982, + "grad_norm": 2.2205045223236084, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7171831130981445, + 
"num_tokens": 191922904.0, + "step": 7576 + }, + { + "epoch": 0.8320887327037119, + "grad_norm": 2.29581880569458, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.6992490291595459, + "num_tokens": 191946537.0, + "step": 7577 + }, + { + "epoch": 0.8321985504063255, + "grad_norm": 2.2326500415802, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7049011588096619, + "num_tokens": 191972194.0, + "step": 7578 + }, + { + "epoch": 0.8323083681089392, + "grad_norm": 2.1536381244659424, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7126839756965637, + "num_tokens": 191998586.0, + "step": 7579 + }, + { + "epoch": 0.8324181858115528, + "grad_norm": 2.087080240249634, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7158488035202026, + "num_tokens": 192023790.0, + "step": 7580 + }, + { + "epoch": 0.8325280035141664, + "grad_norm": 2.3405516147613525, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7114794254302979, + "num_tokens": 192044632.0, + "step": 7581 + }, + { + "epoch": 0.8326378212167801, + "grad_norm": 2.1931636333465576, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.7018566727638245, + "num_tokens": 192070171.0, + "step": 7582 + }, + { + "epoch": 0.8327476389193939, + "grad_norm": 2.336867332458496, + "learning_rate": 1e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.7269539833068848, + "num_tokens": 192092052.0, + "step": 7583 + }, + { + "epoch": 0.8328574566220075, + "grad_norm": 1.9999220371246338, + "learning_rate": 1e-06, + "loss": 1.105, + "mean_token_accuracy": 0.6672114729881287, + "num_tokens": 192123533.0, + "step": 7584 + }, + { + "epoch": 0.8329672743246211, + "grad_norm": 2.5766124725341797, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7255895137786865, + "num_tokens": 192142645.0, + "step": 7585 + }, + { + "epoch": 0.8330770920272348, + "grad_norm": 2.2265708446502686, + "learning_rate": 
1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.6869521141052246, + "num_tokens": 192166529.0, + "step": 7586 + }, + { + "epoch": 0.8331869097298484, + "grad_norm": 2.1418087482452393, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.6909198760986328, + "num_tokens": 192192776.0, + "step": 7587 + }, + { + "epoch": 0.8332967274324621, + "grad_norm": 2.0045950412750244, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.6951634287834167, + "num_tokens": 192221255.0, + "step": 7588 + }, + { + "epoch": 0.8334065451350757, + "grad_norm": 2.0337488651275635, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6981546878814697, + "num_tokens": 192249593.0, + "step": 7589 + }, + { + "epoch": 0.8335163628376895, + "grad_norm": 2.0597074031829834, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7062972784042358, + "num_tokens": 192278031.0, + "step": 7590 + }, + { + "epoch": 0.8336261805403031, + "grad_norm": 2.182697057723999, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.6964545249938965, + "num_tokens": 192303344.0, + "step": 7591 + }, + { + "epoch": 0.8337359982429168, + "grad_norm": 2.0725858211517334, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.7027611136436462, + "num_tokens": 192331510.0, + "step": 7592 + }, + { + "epoch": 0.8338458159455304, + "grad_norm": 2.225878953933716, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7095960378646851, + "num_tokens": 192356783.0, + "step": 7593 + }, + { + "epoch": 0.8339556336481441, + "grad_norm": 2.1944875717163086, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7214677333831787, + "num_tokens": 192379722.0, + "step": 7594 + }, + { + "epoch": 0.8340654513507577, + "grad_norm": 2.212022542953491, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7103439569473267, + "num_tokens": 192403760.0, + "step": 7595 + }, + { + 
"epoch": 0.8341752690533714, + "grad_norm": 2.0043156147003174, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7087730169296265, + "num_tokens": 192432414.0, + "step": 7596 + }, + { + "epoch": 0.8342850867559851, + "grad_norm": 1.8402727842330933, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7075803279876709, + "num_tokens": 192464313.0, + "step": 7597 + }, + { + "epoch": 0.8343949044585988, + "grad_norm": 2.0270042419433594, + "learning_rate": 1e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.6963954567909241, + "num_tokens": 192492861.0, + "step": 7598 + }, + { + "epoch": 0.8345047221612124, + "grad_norm": 2.5088977813720703, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7323394417762756, + "num_tokens": 192512826.0, + "step": 7599 + }, + { + "epoch": 0.834614539863826, + "grad_norm": 2.1194090843200684, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7056453227996826, + "num_tokens": 192539894.0, + "step": 7600 + }, + { + "epoch": 0.8347243575664397, + "grad_norm": 2.236839532852173, + "learning_rate": 1e-06, + "loss": 1.0534, + "mean_token_accuracy": 0.6893250346183777, + "num_tokens": 192565120.0, + "step": 7601 + }, + { + "epoch": 0.8348341752690533, + "grad_norm": 2.2940964698791504, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7221230268478394, + "num_tokens": 192588050.0, + "step": 7602 + }, + { + "epoch": 0.834943992971667, + "grad_norm": 2.3533082008361816, + "learning_rate": 1e-06, + "loss": 0.8391, + "mean_token_accuracy": 0.7305254936218262, + "num_tokens": 192609697.0, + "step": 7603 + }, + { + "epoch": 0.8350538106742806, + "grad_norm": 2.0349910259246826, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.6972152590751648, + "num_tokens": 192638681.0, + "step": 7604 + }, + { + "epoch": 0.8351636283768944, + "grad_norm": 2.227952718734741, + "learning_rate": 1e-06, + "loss": 0.9399, + 
"mean_token_accuracy": 0.7090449929237366, + "num_tokens": 192662712.0, + "step": 7605 + }, + { + "epoch": 0.835273446079508, + "grad_norm": 2.3310115337371826, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7184683680534363, + "num_tokens": 192684708.0, + "step": 7606 + }, + { + "epoch": 0.8353832637821217, + "grad_norm": 2.194063425064087, + "learning_rate": 1e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.688499927520752, + "num_tokens": 192713035.0, + "step": 7607 + }, + { + "epoch": 0.8354930814847353, + "grad_norm": 2.276672601699829, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.7017074227333069, + "num_tokens": 192738170.0, + "step": 7608 + }, + { + "epoch": 0.835602899187349, + "grad_norm": 2.189911365509033, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.692592978477478, + "num_tokens": 192763634.0, + "step": 7609 + }, + { + "epoch": 0.8357127168899626, + "grad_norm": 2.1620798110961914, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7092280983924866, + "num_tokens": 192790343.0, + "step": 7610 + }, + { + "epoch": 0.8358225345925763, + "grad_norm": 2.017758369445801, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7133709192276001, + "num_tokens": 192817734.0, + "step": 7611 + }, + { + "epoch": 0.83593235229519, + "grad_norm": 2.4745583534240723, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7000081539154053, + "num_tokens": 192841214.0, + "step": 7612 + }, + { + "epoch": 0.8360421699978037, + "grad_norm": 2.2518115043640137, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.6947811245918274, + "num_tokens": 192865473.0, + "step": 7613 + }, + { + "epoch": 0.8361519877004173, + "grad_norm": 2.5304274559020996, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7103056907653809, + "num_tokens": 192886360.0, + "step": 7614 + }, + { + "epoch": 0.836261805403031, + 
"grad_norm": 2.1398744583129883, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7080652117729187, + "num_tokens": 192913933.0, + "step": 7615 + }, + { + "epoch": 0.8363716231056446, + "grad_norm": 2.059607744216919, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7171636819839478, + "num_tokens": 192943488.0, + "step": 7616 + }, + { + "epoch": 0.8364814408082583, + "grad_norm": 2.268939733505249, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7069496512413025, + "num_tokens": 192968368.0, + "step": 7617 + }, + { + "epoch": 0.8365912585108719, + "grad_norm": 2.27687668800354, + "learning_rate": 1e-06, + "loss": 0.8047, + "mean_token_accuracy": 0.7485694885253906, + "num_tokens": 192990570.0, + "step": 7618 + }, + { + "epoch": 0.8367010762134857, + "grad_norm": 2.22871470451355, + "learning_rate": 1e-06, + "loss": 0.8656, + "mean_token_accuracy": 0.7254716753959656, + "num_tokens": 193013729.0, + "step": 7619 + }, + { + "epoch": 0.8368108939160993, + "grad_norm": 2.1084280014038086, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7154741883277893, + "num_tokens": 193040265.0, + "step": 7620 + }, + { + "epoch": 0.836920711618713, + "grad_norm": 2.46394419670105, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.7003111839294434, + "num_tokens": 193063083.0, + "step": 7621 + }, + { + "epoch": 0.8370305293213266, + "grad_norm": 2.1624178886413574, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7153811454772949, + "num_tokens": 193088641.0, + "step": 7622 + }, + { + "epoch": 0.8371403470239402, + "grad_norm": 2.368537664413452, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.6976889371871948, + "num_tokens": 193111777.0, + "step": 7623 + }, + { + "epoch": 0.8372501647265539, + "grad_norm": 2.550948143005371, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.72187739610672, + "num_tokens": 
193132473.0, + "step": 7624 + }, + { + "epoch": 0.8373599824291675, + "grad_norm": 2.47613787651062, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7200513482093811, + "num_tokens": 193153764.0, + "step": 7625 + }, + { + "epoch": 0.8374698001317813, + "grad_norm": 2.248328447341919, + "learning_rate": 1e-06, + "loss": 0.884, + "mean_token_accuracy": 0.729759693145752, + "num_tokens": 193177324.0, + "step": 7626 + }, + { + "epoch": 0.8375796178343949, + "grad_norm": 2.4113664627075195, + "learning_rate": 1e-06, + "loss": 0.8286, + "mean_token_accuracy": 0.7370043396949768, + "num_tokens": 193196776.0, + "step": 7627 + }, + { + "epoch": 0.8376894355370086, + "grad_norm": 2.2227611541748047, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.6976046562194824, + "num_tokens": 193221562.0, + "step": 7628 + }, + { + "epoch": 0.8377992532396222, + "grad_norm": 1.9999775886535645, + "learning_rate": 1e-06, + "loss": 0.8451, + "mean_token_accuracy": 0.737713098526001, + "num_tokens": 193250104.0, + "step": 7629 + }, + { + "epoch": 0.8379090709422359, + "grad_norm": 2.4028799533843994, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.726362943649292, + "num_tokens": 193270919.0, + "step": 7630 + }, + { + "epoch": 0.8380188886448495, + "grad_norm": 2.2014851570129395, + "learning_rate": 1e-06, + "loss": 1.0826, + "mean_token_accuracy": 0.6719912886619568, + "num_tokens": 193297610.0, + "step": 7631 + }, + { + "epoch": 0.8381287063474632, + "grad_norm": 2.2159175872802734, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7255876660346985, + "num_tokens": 193321451.0, + "step": 7632 + }, + { + "epoch": 0.8382385240500769, + "grad_norm": 2.113633871078491, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7191472053527832, + "num_tokens": 193348912.0, + "step": 7633 + }, + { + "epoch": 0.8383483417526906, + "grad_norm": 2.332655429840088, + "learning_rate": 1e-06, + 
"loss": 0.984, + "mean_token_accuracy": 0.707846462726593, + "num_tokens": 193372559.0, + "step": 7634 + }, + { + "epoch": 0.8384581594553042, + "grad_norm": 2.2281644344329834, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.7078169584274292, + "num_tokens": 193398837.0, + "step": 7635 + }, + { + "epoch": 0.8385679771579179, + "grad_norm": 2.003838300704956, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.7078042030334473, + "num_tokens": 193429808.0, + "step": 7636 + }, + { + "epoch": 0.8386777948605315, + "grad_norm": 1.9934096336364746, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7056452035903931, + "num_tokens": 193459621.0, + "step": 7637 + }, + { + "epoch": 0.8387876125631452, + "grad_norm": 2.0131309032440186, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7166041731834412, + "num_tokens": 193489117.0, + "step": 7638 + }, + { + "epoch": 0.8388974302657588, + "grad_norm": 2.1472973823547363, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7201635241508484, + "num_tokens": 193514330.0, + "step": 7639 + }, + { + "epoch": 0.8390072479683724, + "grad_norm": 2.2664575576782227, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.7005400061607361, + "num_tokens": 193537353.0, + "step": 7640 + }, + { + "epoch": 0.8391170656709862, + "grad_norm": 2.1772756576538086, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7348676919937134, + "num_tokens": 193563285.0, + "step": 7641 + }, + { + "epoch": 0.8392268833735999, + "grad_norm": 2.1937689781188965, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7160176038742065, + "num_tokens": 193591204.0, + "step": 7642 + }, + { + "epoch": 0.8393367010762135, + "grad_norm": 2.228412389755249, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.707086443901062, + "num_tokens": 193614812.0, + "step": 7643 + }, + { + "epoch": 
0.8394465187788271, + "grad_norm": 2.090884208679199, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.6992686986923218, + "num_tokens": 193641488.0, + "step": 7644 + }, + { + "epoch": 0.8395563364814408, + "grad_norm": 2.0572667121887207, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.710640013217926, + "num_tokens": 193670257.0, + "step": 7645 + }, + { + "epoch": 0.8396661541840544, + "grad_norm": 2.31829833984375, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.7094564437866211, + "num_tokens": 193693145.0, + "step": 7646 + }, + { + "epoch": 0.8397759718866681, + "grad_norm": 2.0722875595092773, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.6974443197250366, + "num_tokens": 193721746.0, + "step": 7647 + }, + { + "epoch": 0.8398857895892818, + "grad_norm": 2.2229552268981934, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.7069381475448608, + "num_tokens": 193747865.0, + "step": 7648 + }, + { + "epoch": 0.8399956072918955, + "grad_norm": 1.993106722831726, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.6954692602157593, + "num_tokens": 193777895.0, + "step": 7649 + }, + { + "epoch": 0.8401054249945091, + "grad_norm": 2.353924036026001, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7117721438407898, + "num_tokens": 193800531.0, + "step": 7650 + }, + { + "epoch": 0.8402152426971228, + "grad_norm": 2.20424222946167, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7153043150901794, + "num_tokens": 193826612.0, + "step": 7651 + }, + { + "epoch": 0.8403250603997364, + "grad_norm": 2.167057752609253, + "learning_rate": 1e-06, + "loss": 1.0385, + "mean_token_accuracy": 0.6925293803215027, + "num_tokens": 193853836.0, + "step": 7652 + }, + { + "epoch": 0.8404348781023501, + "grad_norm": 2.4002299308776855, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 
0.7233678698539734, + "num_tokens": 193874384.0, + "step": 7653 + }, + { + "epoch": 0.8405446958049637, + "grad_norm": 2.045041084289551, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.6923572421073914, + "num_tokens": 193902008.0, + "step": 7654 + }, + { + "epoch": 0.8406545135075775, + "grad_norm": 2.2956461906433105, + "learning_rate": 1e-06, + "loss": 0.8212, + "mean_token_accuracy": 0.74830561876297, + "num_tokens": 193924574.0, + "step": 7655 + }, + { + "epoch": 0.8407643312101911, + "grad_norm": 2.5115931034088135, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.6998347043991089, + "num_tokens": 193945471.0, + "step": 7656 + }, + { + "epoch": 0.8408741489128048, + "grad_norm": 2.170818328857422, + "learning_rate": 1e-06, + "loss": 1.0427, + "mean_token_accuracy": 0.6873116493225098, + "num_tokens": 193970956.0, + "step": 7657 + }, + { + "epoch": 0.8409839666154184, + "grad_norm": 2.202179431915283, + "learning_rate": 1e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.7217848896980286, + "num_tokens": 193993799.0, + "step": 7658 + }, + { + "epoch": 0.8410937843180321, + "grad_norm": 2.3096513748168945, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.6934008002281189, + "num_tokens": 194017928.0, + "step": 7659 + }, + { + "epoch": 0.8412036020206457, + "grad_norm": 2.1311192512512207, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.692549467086792, + "num_tokens": 194045356.0, + "step": 7660 + }, + { + "epoch": 0.8413134197232593, + "grad_norm": 2.1460347175598145, + "learning_rate": 1e-06, + "loss": 1.0489, + "mean_token_accuracy": 0.6866931319236755, + "num_tokens": 194071955.0, + "step": 7661 + }, + { + "epoch": 0.8414232374258731, + "grad_norm": 2.6351094245910645, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7093240022659302, + "num_tokens": 194089659.0, + "step": 7662 + }, + { + "epoch": 0.8415330551284868, + "grad_norm": 
2.138998508453369, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.719012975692749, + "num_tokens": 194117853.0, + "step": 7663 + }, + { + "epoch": 0.8416428728311004, + "grad_norm": 2.3255646228790283, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7217161059379578, + "num_tokens": 194139815.0, + "step": 7664 + }, + { + "epoch": 0.841752690533714, + "grad_norm": 2.0921733379364014, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7108969688415527, + "num_tokens": 194167132.0, + "step": 7665 + }, + { + "epoch": 0.8418625082363277, + "grad_norm": 2.1148600578308105, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7036664485931396, + "num_tokens": 194193738.0, + "step": 7666 + }, + { + "epoch": 0.8419723259389413, + "grad_norm": 2.220846652984619, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7186658978462219, + "num_tokens": 194218054.0, + "step": 7667 + }, + { + "epoch": 0.842082143641555, + "grad_norm": 2.4134721755981445, + "learning_rate": 1e-06, + "loss": 1.0323, + "mean_token_accuracy": 0.6925735473632812, + "num_tokens": 194241414.0, + "step": 7668 + }, + { + "epoch": 0.8421919613441686, + "grad_norm": 2.007361888885498, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7034032940864563, + "num_tokens": 194270357.0, + "step": 7669 + }, + { + "epoch": 0.8423017790467824, + "grad_norm": 2.3901994228363037, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7043973803520203, + "num_tokens": 194293071.0, + "step": 7670 + }, + { + "epoch": 0.842411596749396, + "grad_norm": 2.124211072921753, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7199493646621704, + "num_tokens": 194319523.0, + "step": 7671 + }, + { + "epoch": 0.8425214144520097, + "grad_norm": 2.3193724155426025, + "learning_rate": 1e-06, + "loss": 1.0237, + "mean_token_accuracy": 0.6969939470291138, + "num_tokens": 
194344213.0, + "step": 7672 + }, + { + "epoch": 0.8426312321546233, + "grad_norm": 2.1335511207580566, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.6961753964424133, + "num_tokens": 194372044.0, + "step": 7673 + }, + { + "epoch": 0.842741049857237, + "grad_norm": 2.3837316036224365, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7255859375, + "num_tokens": 194395672.0, + "step": 7674 + }, + { + "epoch": 0.8428508675598506, + "grad_norm": 2.1350655555725098, + "learning_rate": 1e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.7397098541259766, + "num_tokens": 194420363.0, + "step": 7675 + }, + { + "epoch": 0.8429606852624643, + "grad_norm": 2.261929988861084, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.6937686204910278, + "num_tokens": 194445979.0, + "step": 7676 + }, + { + "epoch": 0.843070502965078, + "grad_norm": 2.4741313457489014, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.6899559497833252, + "num_tokens": 194467378.0, + "step": 7677 + }, + { + "epoch": 0.8431803206676917, + "grad_norm": 2.1464803218841553, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7179583311080933, + "num_tokens": 194491628.0, + "step": 7678 + }, + { + "epoch": 0.8432901383703053, + "grad_norm": 2.1456992626190186, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7123000621795654, + "num_tokens": 194515921.0, + "step": 7679 + }, + { + "epoch": 0.843399956072919, + "grad_norm": 2.0788681507110596, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7239391803741455, + "num_tokens": 194541184.0, + "step": 7680 + }, + { + "epoch": 0.8435097737755326, + "grad_norm": 2.3854894638061523, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7070809006690979, + "num_tokens": 194561356.0, + "step": 7681 + }, + { + "epoch": 0.8436195914781462, + "grad_norm": 2.5659408569335938, + "learning_rate": 1e-06, + "loss": 
0.7864, + "mean_token_accuracy": 0.754216194152832, + "num_tokens": 194578269.0, + "step": 7682 + }, + { + "epoch": 0.8437294091807599, + "grad_norm": 2.2821388244628906, + "learning_rate": 1e-06, + "loss": 0.8999, + "mean_token_accuracy": 0.7234170436859131, + "num_tokens": 194600550.0, + "step": 7683 + }, + { + "epoch": 0.8438392268833736, + "grad_norm": 1.9402079582214355, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7073160409927368, + "num_tokens": 194631014.0, + "step": 7684 + }, + { + "epoch": 0.8439490445859873, + "grad_norm": 2.314804792404175, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.6959377527236938, + "num_tokens": 194655195.0, + "step": 7685 + }, + { + "epoch": 0.8440588622886009, + "grad_norm": 2.4727461338043213, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7073774337768555, + "num_tokens": 194675652.0, + "step": 7686 + }, + { + "epoch": 0.8441686799912146, + "grad_norm": 2.0949881076812744, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.6951172947883606, + "num_tokens": 194702773.0, + "step": 7687 + }, + { + "epoch": 0.8442784976938282, + "grad_norm": 2.282087802886963, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7092382907867432, + "num_tokens": 194726606.0, + "step": 7688 + }, + { + "epoch": 0.8443883153964419, + "grad_norm": 2.187601089477539, + "learning_rate": 1e-06, + "loss": 0.8649, + "mean_token_accuracy": 0.7314861416816711, + "num_tokens": 194752124.0, + "step": 7689 + }, + { + "epoch": 0.8444981330990555, + "grad_norm": 2.4513092041015625, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7106461524963379, + "num_tokens": 194772418.0, + "step": 7690 + }, + { + "epoch": 0.8446079508016693, + "grad_norm": 2.302814483642578, + "learning_rate": 1e-06, + "loss": 1.0157, + "mean_token_accuracy": 0.6851385831832886, + "num_tokens": 194796993.0, + "step": 7691 + }, + { + "epoch": 
0.8447177685042829, + "grad_norm": 2.208047389984131, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7156988382339478, + "num_tokens": 194823917.0, + "step": 7692 + }, + { + "epoch": 0.8448275862068966, + "grad_norm": 2.0028235912323, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.6981253623962402, + "num_tokens": 194853952.0, + "step": 7693 + }, + { + "epoch": 0.8449374039095102, + "grad_norm": 2.0995113849639893, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7223232388496399, + "num_tokens": 194880838.0, + "step": 7694 + }, + { + "epoch": 0.8450472216121239, + "grad_norm": 2.7220823764801025, + "learning_rate": 1e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.7268279194831848, + "num_tokens": 194897077.0, + "step": 7695 + }, + { + "epoch": 0.8451570393147375, + "grad_norm": 2.1276631355285645, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7329614162445068, + "num_tokens": 194922570.0, + "step": 7696 + }, + { + "epoch": 0.8452668570173512, + "grad_norm": 2.345305919647217, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7181426286697388, + "num_tokens": 194945197.0, + "step": 7697 + }, + { + "epoch": 0.8453766747199648, + "grad_norm": 2.1573662757873535, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7160630822181702, + "num_tokens": 194971271.0, + "step": 7698 + }, + { + "epoch": 0.8454864924225786, + "grad_norm": 2.02178692817688, + "learning_rate": 1e-06, + "loss": 1.0334, + "mean_token_accuracy": 0.6879711151123047, + "num_tokens": 195001782.0, + "step": 7699 + }, + { + "epoch": 0.8455963101251922, + "grad_norm": 2.1387650966644287, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7091682553291321, + "num_tokens": 195027148.0, + "step": 7700 + }, + { + "epoch": 0.8457061278278059, + "grad_norm": 2.0191380977630615, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 
0.6992346048355103, + "num_tokens": 195056126.0, + "step": 7701 + }, + { + "epoch": 0.8458159455304195, + "grad_norm": 2.2834932804107666, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7282514572143555, + "num_tokens": 195078322.0, + "step": 7702 + }, + { + "epoch": 0.8459257632330331, + "grad_norm": 2.134448289871216, + "learning_rate": 1e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7225755453109741, + "num_tokens": 195103981.0, + "step": 7703 + }, + { + "epoch": 0.8460355809356468, + "grad_norm": 2.2950172424316406, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.7036929130554199, + "num_tokens": 195128048.0, + "step": 7704 + }, + { + "epoch": 0.8461453986382604, + "grad_norm": 1.8640893697738647, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7067981958389282, + "num_tokens": 195161675.0, + "step": 7705 + }, + { + "epoch": 0.8462552163408742, + "grad_norm": 1.7689392566680908, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.6910943984985352, + "num_tokens": 195199274.0, + "step": 7706 + }, + { + "epoch": 0.8463650340434878, + "grad_norm": 2.178380012512207, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7057437300682068, + "num_tokens": 195222440.0, + "step": 7707 + }, + { + "epoch": 0.8464748517461015, + "grad_norm": 1.965620517730713, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7092058658599854, + "num_tokens": 195251385.0, + "step": 7708 + }, + { + "epoch": 0.8465846694487151, + "grad_norm": 2.282583713531494, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7178306579589844, + "num_tokens": 195274064.0, + "step": 7709 + }, + { + "epoch": 0.8466944871513288, + "grad_norm": 2.234009265899658, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.710124135017395, + "num_tokens": 195299241.0, + "step": 7710 + }, + { + "epoch": 0.8468043048539424, + "grad_norm": 
2.1750893592834473, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7155824899673462, + "num_tokens": 195323904.0, + "step": 7711 + }, + { + "epoch": 0.8469141225565561, + "grad_norm": 1.9375942945480347, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.7023022770881653, + "num_tokens": 195355871.0, + "step": 7712 + }, + { + "epoch": 0.8470239402591698, + "grad_norm": 2.319307327270508, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.6999044418334961, + "num_tokens": 195379526.0, + "step": 7713 + }, + { + "epoch": 0.8471337579617835, + "grad_norm": 2.1533470153808594, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7147485017776489, + "num_tokens": 195404157.0, + "step": 7714 + }, + { + "epoch": 0.8472435756643971, + "grad_norm": 2.2349348068237305, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7062729597091675, + "num_tokens": 195429262.0, + "step": 7715 + }, + { + "epoch": 0.8473533933670108, + "grad_norm": 1.9980595111846924, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.707269549369812, + "num_tokens": 195458910.0, + "step": 7716 + }, + { + "epoch": 0.8474632110696244, + "grad_norm": 2.339817762374878, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.6970065832138062, + "num_tokens": 195484151.0, + "step": 7717 + }, + { + "epoch": 0.8475730287722381, + "grad_norm": 2.1154041290283203, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.6892242431640625, + "num_tokens": 195510855.0, + "step": 7718 + }, + { + "epoch": 0.8476828464748517, + "grad_norm": 2.0485146045684814, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7233016490936279, + "num_tokens": 195539072.0, + "step": 7719 + }, + { + "epoch": 0.8477926641774655, + "grad_norm": 2.32486629486084, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7212042808532715, + "num_tokens": 
195561343.0, + "step": 7720 + }, + { + "epoch": 0.8479024818800791, + "grad_norm": 2.0984690189361572, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.6919102072715759, + "num_tokens": 195589109.0, + "step": 7721 + }, + { + "epoch": 0.8480122995826928, + "grad_norm": 2.313076972961426, + "learning_rate": 1e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.7227616310119629, + "num_tokens": 195610143.0, + "step": 7722 + }, + { + "epoch": 0.8481221172853064, + "grad_norm": 2.2163209915161133, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.6925943493843079, + "num_tokens": 195637114.0, + "step": 7723 + }, + { + "epoch": 0.84823193498792, + "grad_norm": 2.0675413608551025, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7018024921417236, + "num_tokens": 195662796.0, + "step": 7724 + }, + { + "epoch": 0.8483417526905337, + "grad_norm": 2.1714537143707275, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7015849351882935, + "num_tokens": 195689185.0, + "step": 7725 + }, + { + "epoch": 0.8484515703931473, + "grad_norm": 2.350066900253296, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7018508315086365, + "num_tokens": 195712256.0, + "step": 7726 + }, + { + "epoch": 0.848561388095761, + "grad_norm": 2.150020122528076, + "learning_rate": 1e-06, + "loss": 1.0488, + "mean_token_accuracy": 0.6801694631576538, + "num_tokens": 195740496.0, + "step": 7727 + }, + { + "epoch": 0.8486712057983747, + "grad_norm": 2.042712450027466, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7028899192810059, + "num_tokens": 195770187.0, + "step": 7728 + }, + { + "epoch": 0.8487810235009884, + "grad_norm": 2.1588938236236572, + "learning_rate": 1e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.7002813816070557, + "num_tokens": 195796812.0, + "step": 7729 + }, + { + "epoch": 0.848890841203602, + "grad_norm": 1.9577966928482056, + "learning_rate": 1e-06, + 
"loss": 0.9148, + "mean_token_accuracy": 0.7166584134101868, + "num_tokens": 195826084.0, + "step": 7730 + }, + { + "epoch": 0.8490006589062157, + "grad_norm": 2.046895742416382, + "learning_rate": 1e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.6882882118225098, + "num_tokens": 195855327.0, + "step": 7731 + }, + { + "epoch": 0.8491104766088293, + "grad_norm": 2.250129461288452, + "learning_rate": 1e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7196540236473083, + "num_tokens": 195878014.0, + "step": 7732 + }, + { + "epoch": 0.849220294311443, + "grad_norm": 2.510833978652954, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7059808373451233, + "num_tokens": 195898244.0, + "step": 7733 + }, + { + "epoch": 0.8493301120140566, + "grad_norm": 2.5894956588745117, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7220242023468018, + "num_tokens": 195916212.0, + "step": 7734 + }, + { + "epoch": 0.8494399297166704, + "grad_norm": 2.152017116546631, + "learning_rate": 1e-06, + "loss": 1.0765, + "mean_token_accuracy": 0.6808561086654663, + "num_tokens": 195943700.0, + "step": 7735 + }, + { + "epoch": 0.849549747419284, + "grad_norm": 2.1045889854431152, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.6963635683059692, + "num_tokens": 195970655.0, + "step": 7736 + }, + { + "epoch": 0.8496595651218977, + "grad_norm": 2.335716485977173, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.6999121904373169, + "num_tokens": 195993412.0, + "step": 7737 + }, + { + "epoch": 0.8497693828245113, + "grad_norm": 2.432558059692383, + "learning_rate": 1e-06, + "loss": 0.8505, + "mean_token_accuracy": 0.7322520017623901, + "num_tokens": 196012840.0, + "step": 7738 + }, + { + "epoch": 0.849879200527125, + "grad_norm": 2.2482919692993164, + "learning_rate": 1e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.738811194896698, + "num_tokens": 196035167.0, + "step": 7739 + }, + { + "epoch": 
0.8499890182297386, + "grad_norm": 1.9960055351257324, + "learning_rate": 1e-06, + "loss": 1.0366, + "mean_token_accuracy": 0.6938812136650085, + "num_tokens": 196064623.0, + "step": 7740 + }, + { + "epoch": 0.8500988359323522, + "grad_norm": 2.32389497756958, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7051393985748291, + "num_tokens": 196087789.0, + "step": 7741 + }, + { + "epoch": 0.850208653634966, + "grad_norm": 2.2230708599090576, + "learning_rate": 1e-06, + "loss": 0.8522, + "mean_token_accuracy": 0.7302355766296387, + "num_tokens": 196109313.0, + "step": 7742 + }, + { + "epoch": 0.8503184713375797, + "grad_norm": 2.003871202468872, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7176377773284912, + "num_tokens": 196138424.0, + "step": 7743 + }, + { + "epoch": 0.8504282890401933, + "grad_norm": 2.0540127754211426, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.726125955581665, + "num_tokens": 196169416.0, + "step": 7744 + }, + { + "epoch": 0.8505381067428069, + "grad_norm": 2.297941207885742, + "learning_rate": 1e-06, + "loss": 1.0607, + "mean_token_accuracy": 0.6862741708755493, + "num_tokens": 196193370.0, + "step": 7745 + }, + { + "epoch": 0.8506479244454206, + "grad_norm": 1.9084010124206543, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.6899598836898804, + "num_tokens": 196224718.0, + "step": 7746 + }, + { + "epoch": 0.8507577421480342, + "grad_norm": 2.3854873180389404, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7060030698776245, + "num_tokens": 196245595.0, + "step": 7747 + }, + { + "epoch": 0.8508675598506479, + "grad_norm": 2.208076000213623, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7281078100204468, + "num_tokens": 196267899.0, + "step": 7748 + }, + { + "epoch": 0.8509773775532616, + "grad_norm": 2.5704004764556885, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 
0.7102545499801636, + "num_tokens": 196286131.0, + "step": 7749 + }, + { + "epoch": 0.8510871952558753, + "grad_norm": 1.9662507772445679, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7118129730224609, + "num_tokens": 196315838.0, + "step": 7750 + }, + { + "epoch": 0.8511970129584889, + "grad_norm": 2.225212335586548, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.6943368315696716, + "num_tokens": 196343568.0, + "step": 7751 + }, + { + "epoch": 0.8513068306611026, + "grad_norm": 2.282724142074585, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.6993626356124878, + "num_tokens": 196367614.0, + "step": 7752 + }, + { + "epoch": 0.8514166483637162, + "grad_norm": 1.9104177951812744, + "learning_rate": 1e-06, + "loss": 1.0488, + "mean_token_accuracy": 0.6814069747924805, + "num_tokens": 196400978.0, + "step": 7753 + }, + { + "epoch": 0.8515264660663299, + "grad_norm": 2.244293451309204, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7036490440368652, + "num_tokens": 196426673.0, + "step": 7754 + }, + { + "epoch": 0.8516362837689435, + "grad_norm": 2.587059497833252, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7276979684829712, + "num_tokens": 196445319.0, + "step": 7755 + }, + { + "epoch": 0.8517461014715572, + "grad_norm": 2.4146769046783447, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7225797176361084, + "num_tokens": 196465265.0, + "step": 7756 + }, + { + "epoch": 0.8518559191741709, + "grad_norm": 2.179609775543213, + "learning_rate": 1e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7260003089904785, + "num_tokens": 196490823.0, + "step": 7757 + }, + { + "epoch": 0.8519657368767846, + "grad_norm": 2.1149492263793945, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7026146650314331, + "num_tokens": 196519022.0, + "step": 7758 + }, + { + "epoch": 0.8520755545793982, + "grad_norm": 
2.512247085571289, + "learning_rate": 1e-06, + "loss": 1.0346, + "mean_token_accuracy": 0.697467565536499, + "num_tokens": 196541237.0, + "step": 7759 + }, + { + "epoch": 0.8521853722820119, + "grad_norm": 2.1566708087921143, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7024157047271729, + "num_tokens": 196570978.0, + "step": 7760 + }, + { + "epoch": 0.8522951899846255, + "grad_norm": 2.5094518661499023, + "learning_rate": 1e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.6886773109436035, + "num_tokens": 196591975.0, + "step": 7761 + }, + { + "epoch": 0.8524050076872391, + "grad_norm": 2.287100315093994, + "learning_rate": 1e-06, + "loss": 1.0589, + "mean_token_accuracy": 0.6742878556251526, + "num_tokens": 196616976.0, + "step": 7762 + }, + { + "epoch": 0.8525148253898528, + "grad_norm": 2.1707935333251953, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7061778903007507, + "num_tokens": 196641534.0, + "step": 7763 + }, + { + "epoch": 0.8526246430924665, + "grad_norm": 2.204658031463623, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.7016088962554932, + "num_tokens": 196666117.0, + "step": 7764 + }, + { + "epoch": 0.8527344607950802, + "grad_norm": 2.1650044918060303, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.7044869065284729, + "num_tokens": 196690947.0, + "step": 7765 + }, + { + "epoch": 0.8528442784976938, + "grad_norm": 2.290618419647217, + "learning_rate": 1e-06, + "loss": 1.0554, + "mean_token_accuracy": 0.6837595105171204, + "num_tokens": 196716470.0, + "step": 7766 + }, + { + "epoch": 0.8529540962003075, + "grad_norm": 2.519062042236328, + "learning_rate": 1e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.6951353549957275, + "num_tokens": 196737778.0, + "step": 7767 + }, + { + "epoch": 0.8530639139029211, + "grad_norm": 2.2834248542785645, + "learning_rate": 1e-06, + "loss": 1.0543, + "mean_token_accuracy": 0.6768657565116882, + "num_tokens": 
196766132.0, + "step": 7768 + }, + { + "epoch": 0.8531737316055348, + "grad_norm": 2.4610087871551514, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.717847466468811, + "num_tokens": 196787794.0, + "step": 7769 + }, + { + "epoch": 0.8532835493081484, + "grad_norm": 2.1711974143981934, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7199134230613708, + "num_tokens": 196811946.0, + "step": 7770 + }, + { + "epoch": 0.8533933670107622, + "grad_norm": 2.2643215656280518, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7181349992752075, + "num_tokens": 196834993.0, + "step": 7771 + }, + { + "epoch": 0.8535031847133758, + "grad_norm": 2.0873048305511475, + "learning_rate": 1e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7099817991256714, + "num_tokens": 196865021.0, + "step": 7772 + }, + { + "epoch": 0.8536130024159895, + "grad_norm": 2.0978376865386963, + "learning_rate": 1e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7337524890899658, + "num_tokens": 196892055.0, + "step": 7773 + }, + { + "epoch": 0.8537228201186031, + "grad_norm": 2.3200814723968506, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7106451988220215, + "num_tokens": 196916170.0, + "step": 7774 + }, + { + "epoch": 0.8538326378212168, + "grad_norm": 2.043923854827881, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7206456661224365, + "num_tokens": 196945339.0, + "step": 7775 + }, + { + "epoch": 0.8539424555238304, + "grad_norm": 2.292699098587036, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.6967089772224426, + "num_tokens": 196969079.0, + "step": 7776 + }, + { + "epoch": 0.8540522732264441, + "grad_norm": 2.18493390083313, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7142024040222168, + "num_tokens": 196994518.0, + "step": 7777 + }, + { + "epoch": 0.8541620909290578, + "grad_norm": 2.25272536277771, + "learning_rate": 1e-06, + "loss": 
0.8132, + "mean_token_accuracy": 0.7470174431800842, + "num_tokens": 197016144.0, + "step": 7778 + }, + { + "epoch": 0.8542719086316715, + "grad_norm": 2.2520594596862793, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.6944268345832825, + "num_tokens": 197041250.0, + "step": 7779 + }, + { + "epoch": 0.8543817263342851, + "grad_norm": 1.9552921056747437, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.703283429145813, + "num_tokens": 197073084.0, + "step": 7780 + }, + { + "epoch": 0.8544915440368988, + "grad_norm": 2.1124420166015625, + "learning_rate": 1e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.6997569799423218, + "num_tokens": 197101059.0, + "step": 7781 + }, + { + "epoch": 0.8546013617395124, + "grad_norm": 2.0107197761535645, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.6992465853691101, + "num_tokens": 197128269.0, + "step": 7782 + }, + { + "epoch": 0.854711179442126, + "grad_norm": 2.011611223220825, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7139711976051331, + "num_tokens": 197155541.0, + "step": 7783 + }, + { + "epoch": 0.8548209971447397, + "grad_norm": 2.042637348175049, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.6914874315261841, + "num_tokens": 197183436.0, + "step": 7784 + }, + { + "epoch": 0.8549308148473534, + "grad_norm": 2.3401520252227783, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7220734357833862, + "num_tokens": 197204116.0, + "step": 7785 + }, + { + "epoch": 0.8550406325499671, + "grad_norm": 2.485166311264038, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7013700604438782, + "num_tokens": 197225521.0, + "step": 7786 + }, + { + "epoch": 0.8551504502525807, + "grad_norm": 2.108638286590576, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.7059087753295898, + "num_tokens": 197251676.0, + "step": 7787 + }, + { + "epoch": 
0.8552602679551944, + "grad_norm": 2.0550782680511475, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7086977362632751, + "num_tokens": 197280775.0, + "step": 7788 + }, + { + "epoch": 0.855370085657808, + "grad_norm": 2.215613603591919, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.7150193452835083, + "num_tokens": 197305068.0, + "step": 7789 + }, + { + "epoch": 0.8554799033604217, + "grad_norm": 2.102794885635376, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7088459730148315, + "num_tokens": 197330656.0, + "step": 7790 + }, + { + "epoch": 0.8555897210630353, + "grad_norm": 2.115097761154175, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7173258066177368, + "num_tokens": 197354345.0, + "step": 7791 + }, + { + "epoch": 0.855699538765649, + "grad_norm": 2.3314294815063477, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.721102237701416, + "num_tokens": 197376372.0, + "step": 7792 + }, + { + "epoch": 0.8558093564682627, + "grad_norm": 2.1144003868103027, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.708072304725647, + "num_tokens": 197403033.0, + "step": 7793 + }, + { + "epoch": 0.8559191741708764, + "grad_norm": 2.183403253555298, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.6909931898117065, + "num_tokens": 197426707.0, + "step": 7794 + }, + { + "epoch": 0.85602899187349, + "grad_norm": 1.8926457166671753, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7090153694152832, + "num_tokens": 197459249.0, + "step": 7795 + }, + { + "epoch": 0.8561388095761037, + "grad_norm": 2.4130146503448486, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.6998793482780457, + "num_tokens": 197482584.0, + "step": 7796 + }, + { + "epoch": 0.8562486272787173, + "grad_norm": 2.682184934616089, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 
0.6998339295387268, + "num_tokens": 197501478.0, + "step": 7797 + }, + { + "epoch": 0.856358444981331, + "grad_norm": 2.4861605167388916, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7240136861801147, + "num_tokens": 197521108.0, + "step": 7798 + }, + { + "epoch": 0.8564682626839446, + "grad_norm": 2.7125930786132812, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.721335768699646, + "num_tokens": 197539444.0, + "step": 7799 + }, + { + "epoch": 0.8565780803865584, + "grad_norm": 2.5457820892333984, + "learning_rate": 1e-06, + "loss": 0.8537, + "mean_token_accuracy": 0.729924201965332, + "num_tokens": 197556881.0, + "step": 7800 + }, + { + "epoch": 0.856687898089172, + "grad_norm": 2.3826491832733154, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7239273190498352, + "num_tokens": 197579263.0, + "step": 7801 + }, + { + "epoch": 0.8567977157917857, + "grad_norm": 2.0725438594818115, + "learning_rate": 1e-06, + "loss": 1.0531, + "mean_token_accuracy": 0.6808198094367981, + "num_tokens": 197607709.0, + "step": 7802 + }, + { + "epoch": 0.8569075334943993, + "grad_norm": 2.372081756591797, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7099189758300781, + "num_tokens": 197630010.0, + "step": 7803 + }, + { + "epoch": 0.8570173511970129, + "grad_norm": 2.186793565750122, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7160093188285828, + "num_tokens": 197656676.0, + "step": 7804 + }, + { + "epoch": 0.8571271688996266, + "grad_norm": 2.2159371376037598, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.6905322670936584, + "num_tokens": 197681011.0, + "step": 7805 + }, + { + "epoch": 0.8572369866022402, + "grad_norm": 2.0200016498565674, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.7040741443634033, + "num_tokens": 197709634.0, + "step": 7806 + }, + { + "epoch": 0.857346804304854, + "grad_norm": 
2.121143102645874, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7003718614578247, + "num_tokens": 197735550.0, + "step": 7807 + }, + { + "epoch": 0.8574566220074676, + "grad_norm": 2.153956651687622, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7036924362182617, + "num_tokens": 197762987.0, + "step": 7808 + }, + { + "epoch": 0.8575664397100813, + "grad_norm": 1.9590656757354736, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.6952985525131226, + "num_tokens": 197793520.0, + "step": 7809 + }, + { + "epoch": 0.8576762574126949, + "grad_norm": 2.1186420917510986, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7237982749938965, + "num_tokens": 197819975.0, + "step": 7810 + }, + { + "epoch": 0.8577860751153086, + "grad_norm": 2.4577107429504395, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7159227132797241, + "num_tokens": 197841554.0, + "step": 7811 + }, + { + "epoch": 0.8578958928179222, + "grad_norm": 2.2584667205810547, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7110339403152466, + "num_tokens": 197865449.0, + "step": 7812 + }, + { + "epoch": 0.8580057105205359, + "grad_norm": 2.2757441997528076, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7235292792320251, + "num_tokens": 197888280.0, + "step": 7813 + }, + { + "epoch": 0.8581155282231496, + "grad_norm": 2.236992359161377, + "learning_rate": 1e-06, + "loss": 1.043, + "mean_token_accuracy": 0.6873199939727783, + "num_tokens": 197916661.0, + "step": 7814 + }, + { + "epoch": 0.8582253459257633, + "grad_norm": 2.266622304916382, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.6975228786468506, + "num_tokens": 197939932.0, + "step": 7815 + }, + { + "epoch": 0.8583351636283769, + "grad_norm": 2.0399296283721924, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.6984501481056213, + "num_tokens": 
197970854.0, + "step": 7816 + }, + { + "epoch": 0.8584449813309906, + "grad_norm": 2.140434980392456, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.6998677253723145, + "num_tokens": 197995998.0, + "step": 7817 + }, + { + "epoch": 0.8585547990336042, + "grad_norm": 2.107698678970337, + "learning_rate": 1e-06, + "loss": 0.8157, + "mean_token_accuracy": 0.7455552816390991, + "num_tokens": 198020867.0, + "step": 7818 + }, + { + "epoch": 0.8586646167362179, + "grad_norm": 2.20629620552063, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7136198282241821, + "num_tokens": 198046964.0, + "step": 7819 + }, + { + "epoch": 0.8587744344388315, + "grad_norm": 2.1487722396850586, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7160765528678894, + "num_tokens": 198073799.0, + "step": 7820 + }, + { + "epoch": 0.8588842521414451, + "grad_norm": 2.343341112136841, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7027066946029663, + "num_tokens": 198096236.0, + "step": 7821 + }, + { + "epoch": 0.8589940698440589, + "grad_norm": 2.114525318145752, + "learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.685230553150177, + "num_tokens": 198123103.0, + "step": 7822 + }, + { + "epoch": 0.8591038875466726, + "grad_norm": 2.1740639209747314, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7184348106384277, + "num_tokens": 198148116.0, + "step": 7823 + }, + { + "epoch": 0.8592137052492862, + "grad_norm": 2.2640416622161865, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7008779048919678, + "num_tokens": 198171029.0, + "step": 7824 + }, + { + "epoch": 0.8593235229518998, + "grad_norm": 2.1350491046905518, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.7056258916854858, + "num_tokens": 198196397.0, + "step": 7825 + }, + { + "epoch": 0.8594333406545135, + "grad_norm": 2.3330962657928467, + "learning_rate": 1e-06, + 
"loss": 0.9814, + "mean_token_accuracy": 0.703342080116272, + "num_tokens": 198220248.0, + "step": 7826 + }, + { + "epoch": 0.8595431583571271, + "grad_norm": 1.9856425523757935, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7157787084579468, + "num_tokens": 198247631.0, + "step": 7827 + }, + { + "epoch": 0.8596529760597408, + "grad_norm": 2.0656027793884277, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7095688581466675, + "num_tokens": 198273699.0, + "step": 7828 + }, + { + "epoch": 0.8597627937623545, + "grad_norm": 2.3456177711486816, + "learning_rate": 1e-06, + "loss": 0.8202, + "mean_token_accuracy": 0.7478797435760498, + "num_tokens": 198294067.0, + "step": 7829 + }, + { + "epoch": 0.8598726114649682, + "grad_norm": 2.2745771408081055, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7050703763961792, + "num_tokens": 198318264.0, + "step": 7830 + }, + { + "epoch": 0.8599824291675818, + "grad_norm": 2.3837649822235107, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7247308492660522, + "num_tokens": 198343475.0, + "step": 7831 + }, + { + "epoch": 0.8600922468701955, + "grad_norm": 2.0455470085144043, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.691791296005249, + "num_tokens": 198373408.0, + "step": 7832 + }, + { + "epoch": 0.8602020645728091, + "grad_norm": 2.101870059967041, + "learning_rate": 1e-06, + "loss": 1.0362, + "mean_token_accuracy": 0.6914601922035217, + "num_tokens": 198400456.0, + "step": 7833 + }, + { + "epoch": 0.8603118822754228, + "grad_norm": 2.2703142166137695, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7065287232398987, + "num_tokens": 198423908.0, + "step": 7834 + }, + { + "epoch": 0.8604216999780364, + "grad_norm": 2.2223546504974365, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.7003331184387207, + "num_tokens": 198447538.0, + "step": 7835 + }, + { + "epoch": 
0.8605315176806502, + "grad_norm": 2.1308162212371826, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7070791721343994, + "num_tokens": 198473614.0, + "step": 7836 + }, + { + "epoch": 0.8606413353832638, + "grad_norm": 2.168522834777832, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.7034417390823364, + "num_tokens": 198499262.0, + "step": 7837 + }, + { + "epoch": 0.8607511530858775, + "grad_norm": 2.2204136848449707, + "learning_rate": 1e-06, + "loss": 1.0696, + "mean_token_accuracy": 0.6761820316314697, + "num_tokens": 198527715.0, + "step": 7838 + }, + { + "epoch": 0.8608609707884911, + "grad_norm": 1.9520081281661987, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7022857666015625, + "num_tokens": 198558212.0, + "step": 7839 + }, + { + "epoch": 0.8609707884911048, + "grad_norm": 1.9269073009490967, + "learning_rate": 1e-06, + "loss": 1.0624, + "mean_token_accuracy": 0.6781650185585022, + "num_tokens": 198590802.0, + "step": 7840 + }, + { + "epoch": 0.8610806061937184, + "grad_norm": 2.058150053024292, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.6953040957450867, + "num_tokens": 198619425.0, + "step": 7841 + }, + { + "epoch": 0.861190423896332, + "grad_norm": 2.14577054977417, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.710302472114563, + "num_tokens": 198646922.0, + "step": 7842 + }, + { + "epoch": 0.8613002415989458, + "grad_norm": 2.359330415725708, + "learning_rate": 1e-06, + "loss": 0.8512, + "mean_token_accuracy": 0.7403473854064941, + "num_tokens": 198667743.0, + "step": 7843 + }, + { + "epoch": 0.8614100593015594, + "grad_norm": 1.9940979480743408, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7014141082763672, + "num_tokens": 198695957.0, + "step": 7844 + }, + { + "epoch": 0.8615198770041731, + "grad_norm": 2.307792901992798, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 
0.7128589749336243, + "num_tokens": 198718052.0, + "step": 7845 + }, + { + "epoch": 0.8616296947067867, + "grad_norm": 2.3974785804748535, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7201818227767944, + "num_tokens": 198739379.0, + "step": 7846 + }, + { + "epoch": 0.8617395124094004, + "grad_norm": 2.6408944129943848, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7159136533737183, + "num_tokens": 198758238.0, + "step": 7847 + }, + { + "epoch": 0.861849330112014, + "grad_norm": 2.144475221633911, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.6993259787559509, + "num_tokens": 198785613.0, + "step": 7848 + }, + { + "epoch": 0.8619591478146277, + "grad_norm": 2.3973548412323, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7082275152206421, + "num_tokens": 198806876.0, + "step": 7849 + }, + { + "epoch": 0.8620689655172413, + "grad_norm": 1.948492407798767, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.699652373790741, + "num_tokens": 198838149.0, + "step": 7850 + }, + { + "epoch": 0.8621787832198551, + "grad_norm": 1.9139626026153564, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.6982357501983643, + "num_tokens": 198869062.0, + "step": 7851 + }, + { + "epoch": 0.8622886009224687, + "grad_norm": 2.3465259075164795, + "learning_rate": 1e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.724528431892395, + "num_tokens": 198891621.0, + "step": 7852 + }, + { + "epoch": 0.8623984186250824, + "grad_norm": 2.3545613288879395, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7154672741889954, + "num_tokens": 198913575.0, + "step": 7853 + }, + { + "epoch": 0.862508236327696, + "grad_norm": 2.2213189601898193, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7047339677810669, + "num_tokens": 198938653.0, + "step": 7854 + }, + { + "epoch": 0.8626180540303097, + "grad_norm": 
2.6809537410736084, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.720991849899292, + "num_tokens": 198956397.0, + "step": 7855 + }, + { + "epoch": 0.8627278717329233, + "grad_norm": 2.3190226554870605, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.713585615158081, + "num_tokens": 198978899.0, + "step": 7856 + }, + { + "epoch": 0.862837689435537, + "grad_norm": 2.1466455459594727, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7102984189987183, + "num_tokens": 199004962.0, + "step": 7857 + }, + { + "epoch": 0.8629475071381507, + "grad_norm": 2.205709218978882, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7098404169082642, + "num_tokens": 199028718.0, + "step": 7858 + }, + { + "epoch": 0.8630573248407644, + "grad_norm": 2.5453176498413086, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7107731103897095, + "num_tokens": 199050846.0, + "step": 7859 + }, + { + "epoch": 0.863167142543378, + "grad_norm": 1.9594380855560303, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7159746885299683, + "num_tokens": 199078319.0, + "step": 7860 + }, + { + "epoch": 0.8632769602459917, + "grad_norm": 2.4661922454833984, + "learning_rate": 1e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7281281352043152, + "num_tokens": 199098817.0, + "step": 7861 + }, + { + "epoch": 0.8633867779486053, + "grad_norm": 2.5610530376434326, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7019134163856506, + "num_tokens": 199119714.0, + "step": 7862 + }, + { + "epoch": 0.863496595651219, + "grad_norm": 2.389068603515625, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7243932485580444, + "num_tokens": 199141997.0, + "step": 7863 + }, + { + "epoch": 0.8636064133538326, + "grad_norm": 2.3128461837768555, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7289432287216187, + "num_tokens": 
199163830.0, + "step": 7864 + }, + { + "epoch": 0.8637162310564463, + "grad_norm": 2.24055552482605, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.6950646638870239, + "num_tokens": 199188643.0, + "step": 7865 + }, + { + "epoch": 0.86382604875906, + "grad_norm": 2.2485923767089844, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7046961784362793, + "num_tokens": 199211674.0, + "step": 7866 + }, + { + "epoch": 0.8639358664616736, + "grad_norm": 2.2516887187957764, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.718040406703949, + "num_tokens": 199235449.0, + "step": 7867 + }, + { + "epoch": 0.8640456841642873, + "grad_norm": 2.7461676597595215, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.6911531090736389, + "num_tokens": 199252561.0, + "step": 7868 + }, + { + "epoch": 0.8641555018669009, + "grad_norm": 2.2549331188201904, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.6924159526824951, + "num_tokens": 199279173.0, + "step": 7869 + }, + { + "epoch": 0.8642653195695146, + "grad_norm": 2.1562485694885254, + "learning_rate": 1e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.6902797818183899, + "num_tokens": 199305783.0, + "step": 7870 + }, + { + "epoch": 0.8643751372721282, + "grad_norm": 2.113633871078491, + "learning_rate": 1e-06, + "loss": 0.8717, + "mean_token_accuracy": 0.7322536706924438, + "num_tokens": 199330987.0, + "step": 7871 + }, + { + "epoch": 0.864484954974742, + "grad_norm": 2.178018808364868, + "learning_rate": 1e-06, + "loss": 0.7713, + "mean_token_accuracy": 0.7463642954826355, + "num_tokens": 199353740.0, + "step": 7872 + }, + { + "epoch": 0.8645947726773556, + "grad_norm": 2.0648107528686523, + "learning_rate": 1e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.6915141344070435, + "num_tokens": 199380755.0, + "step": 7873 + }, + { + "epoch": 0.8647045903799693, + "grad_norm": 2.0501956939697266, + "learning_rate": 1e-06, + 
"loss": 0.9424, + "mean_token_accuracy": 0.7211717367172241, + "num_tokens": 199406590.0, + "step": 7874 + }, + { + "epoch": 0.8648144080825829, + "grad_norm": 2.46201753616333, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7008452415466309, + "num_tokens": 199426068.0, + "step": 7875 + }, + { + "epoch": 0.8649242257851966, + "grad_norm": 2.2803194522857666, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7133970856666565, + "num_tokens": 199449302.0, + "step": 7876 + }, + { + "epoch": 0.8650340434878102, + "grad_norm": 2.15191388130188, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7247353196144104, + "num_tokens": 199474212.0, + "step": 7877 + }, + { + "epoch": 0.8651438611904239, + "grad_norm": 2.3157825469970703, + "learning_rate": 1e-06, + "loss": 0.849, + "mean_token_accuracy": 0.7298224568367004, + "num_tokens": 199496022.0, + "step": 7878 + }, + { + "epoch": 0.8652536788930375, + "grad_norm": 2.2362663745880127, + "learning_rate": 1e-06, + "loss": 1.029, + "mean_token_accuracy": 0.6860178112983704, + "num_tokens": 199522444.0, + "step": 7879 + }, + { + "epoch": 0.8653634965956513, + "grad_norm": 2.1562726497650146, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6847935914993286, + "num_tokens": 199550183.0, + "step": 7880 + }, + { + "epoch": 0.8654733142982649, + "grad_norm": 2.2848591804504395, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7081657648086548, + "num_tokens": 199574534.0, + "step": 7881 + }, + { + "epoch": 0.8655831320008786, + "grad_norm": 2.0245606899261475, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.703316867351532, + "num_tokens": 199602233.0, + "step": 7882 + }, + { + "epoch": 0.8656929497034922, + "grad_norm": 2.14823579788208, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.6997425556182861, + "num_tokens": 199628588.0, + "step": 7883 + }, + { + "epoch": 
0.8658027674061058, + "grad_norm": 2.4863216876983643, + "learning_rate": 1e-06, + "loss": 1.0374, + "mean_token_accuracy": 0.6840380430221558, + "num_tokens": 199649723.0, + "step": 7884 + }, + { + "epoch": 0.8659125851087195, + "grad_norm": 1.8639270067214966, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7074955105781555, + "num_tokens": 199681681.0, + "step": 7885 + }, + { + "epoch": 0.8660224028113331, + "grad_norm": 2.0479660034179688, + "learning_rate": 1e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.6966922283172607, + "num_tokens": 199710508.0, + "step": 7886 + }, + { + "epoch": 0.8661322205139469, + "grad_norm": 2.1009016036987305, + "learning_rate": 1e-06, + "loss": 1.0467, + "mean_token_accuracy": 0.6821619272232056, + "num_tokens": 199740487.0, + "step": 7887 + }, + { + "epoch": 0.8662420382165605, + "grad_norm": 1.991034746170044, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.6981903314590454, + "num_tokens": 199772701.0, + "step": 7888 + }, + { + "epoch": 0.8663518559191742, + "grad_norm": 1.9641938209533691, + "learning_rate": 1e-06, + "loss": 1.087, + "mean_token_accuracy": 0.678678035736084, + "num_tokens": 199803188.0, + "step": 7889 + }, + { + "epoch": 0.8664616736217878, + "grad_norm": 2.270968198776245, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7277788519859314, + "num_tokens": 199826027.0, + "step": 7890 + }, + { + "epoch": 0.8665714913244015, + "grad_norm": 2.25919508934021, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.693571150302887, + "num_tokens": 199851015.0, + "step": 7891 + }, + { + "epoch": 0.8666813090270151, + "grad_norm": 1.9661251306533813, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7295647859573364, + "num_tokens": 199878970.0, + "step": 7892 + }, + { + "epoch": 0.8667911267296288, + "grad_norm": 2.2033798694610596, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 
0.7112980484962463, + "num_tokens": 199902000.0, + "step": 7893 + }, + { + "epoch": 0.8669009444322425, + "grad_norm": 2.0346193313598633, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.7035149335861206, + "num_tokens": 199932130.0, + "step": 7894 + }, + { + "epoch": 0.8670107621348562, + "grad_norm": 2.2699527740478516, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7233436703681946, + "num_tokens": 199956340.0, + "step": 7895 + }, + { + "epoch": 0.8671205798374698, + "grad_norm": 2.2660739421844482, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.6933711767196655, + "num_tokens": 199980969.0, + "step": 7896 + }, + { + "epoch": 0.8672303975400835, + "grad_norm": 1.9545055627822876, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.6937264800071716, + "num_tokens": 200011424.0, + "step": 7897 + }, + { + "epoch": 0.8673402152426971, + "grad_norm": 1.9707653522491455, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.6967519521713257, + "num_tokens": 200045602.0, + "step": 7898 + }, + { + "epoch": 0.8674500329453108, + "grad_norm": 2.267749786376953, + "learning_rate": 1e-06, + "loss": 1.0372, + "mean_token_accuracy": 0.6817058324813843, + "num_tokens": 200070450.0, + "step": 7899 + }, + { + "epoch": 0.8675598506479244, + "grad_norm": 2.1884844303131104, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7183062434196472, + "num_tokens": 200094809.0, + "step": 7900 + }, + { + "epoch": 0.8676696683505382, + "grad_norm": 2.2662432193756104, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7223289012908936, + "num_tokens": 200119133.0, + "step": 7901 + }, + { + "epoch": 0.8677794860531518, + "grad_norm": 2.3684279918670654, + "learning_rate": 1e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.701085090637207, + "num_tokens": 200142285.0, + "step": 7902 + }, + { + "epoch": 0.8678893037557655, + "grad_norm": 
2.2657864093780518, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7232348918914795, + "num_tokens": 200165719.0, + "step": 7903 + }, + { + "epoch": 0.8679991214583791, + "grad_norm": 1.9447675943374634, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.6948612332344055, + "num_tokens": 200197255.0, + "step": 7904 + }, + { + "epoch": 0.8681089391609927, + "grad_norm": 2.335843563079834, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.69389808177948, + "num_tokens": 200221087.0, + "step": 7905 + }, + { + "epoch": 0.8682187568636064, + "grad_norm": 2.37939715385437, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.6994442343711853, + "num_tokens": 200243535.0, + "step": 7906 + }, + { + "epoch": 0.86832857456622, + "grad_norm": 2.0938003063201904, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.7022485733032227, + "num_tokens": 200271049.0, + "step": 7907 + }, + { + "epoch": 0.8684383922688337, + "grad_norm": 1.8552178144454956, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7113479375839233, + "num_tokens": 200303142.0, + "step": 7908 + }, + { + "epoch": 0.8685482099714474, + "grad_norm": 2.1548140048980713, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.6932934522628784, + "num_tokens": 200329589.0, + "step": 7909 + }, + { + "epoch": 0.8686580276740611, + "grad_norm": 2.6803243160247803, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7051568031311035, + "num_tokens": 200350136.0, + "step": 7910 + }, + { + "epoch": 0.8687678453766747, + "grad_norm": 2.2037851810455322, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7163358330726624, + "num_tokens": 200374444.0, + "step": 7911 + }, + { + "epoch": 0.8688776630792884, + "grad_norm": 2.4230594635009766, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7110958099365234, + "num_tokens": 
200395073.0, + "step": 7912 + }, + { + "epoch": 0.868987480781902, + "grad_norm": 2.038600444793701, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.710780680179596, + "num_tokens": 200423813.0, + "step": 7913 + }, + { + "epoch": 0.8690972984845157, + "grad_norm": 2.262700080871582, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.705701470375061, + "num_tokens": 200448599.0, + "step": 7914 + }, + { + "epoch": 0.8692071161871293, + "grad_norm": 2.2724826335906982, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7249619364738464, + "num_tokens": 200473270.0, + "step": 7915 + }, + { + "epoch": 0.8693169338897431, + "grad_norm": 1.9651471376419067, + "learning_rate": 1e-06, + "loss": 1.0754, + "mean_token_accuracy": 0.6749653816223145, + "num_tokens": 200504146.0, + "step": 7916 + }, + { + "epoch": 0.8694267515923567, + "grad_norm": 2.425741195678711, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.7015948295593262, + "num_tokens": 200529116.0, + "step": 7917 + }, + { + "epoch": 0.8695365692949704, + "grad_norm": 1.959660291671753, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.6972804069519043, + "num_tokens": 200560267.0, + "step": 7918 + }, + { + "epoch": 0.869646386997584, + "grad_norm": 2.123175621032715, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7039788961410522, + "num_tokens": 200588097.0, + "step": 7919 + }, + { + "epoch": 0.8697562047001977, + "grad_norm": 2.1718673706054688, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.6939213275909424, + "num_tokens": 200614663.0, + "step": 7920 + }, + { + "epoch": 0.8698660224028113, + "grad_norm": 2.066241502761841, + "learning_rate": 1e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7276778221130371, + "num_tokens": 200641137.0, + "step": 7921 + }, + { + "epoch": 0.869975840105425, + "grad_norm": 2.1474075317382812, + "learning_rate": 1e-06, + "loss": 
1.0101, + "mean_token_accuracy": 0.7086225748062134, + "num_tokens": 200666795.0, + "step": 7922 + }, + { + "epoch": 0.8700856578080387, + "grad_norm": 2.2479162216186523, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7116670608520508, + "num_tokens": 200690020.0, + "step": 7923 + }, + { + "epoch": 0.8701954755106523, + "grad_norm": 2.490135431289673, + "learning_rate": 1e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7367340922355652, + "num_tokens": 200710046.0, + "step": 7924 + }, + { + "epoch": 0.870305293213266, + "grad_norm": 2.1631994247436523, + "learning_rate": 1e-06, + "loss": 1.0497, + "mean_token_accuracy": 0.688204824924469, + "num_tokens": 200737166.0, + "step": 7925 + }, + { + "epoch": 0.8704151109158796, + "grad_norm": 2.3527750968933105, + "learning_rate": 1e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7259202003479004, + "num_tokens": 200760753.0, + "step": 7926 + }, + { + "epoch": 0.8705249286184933, + "grad_norm": 2.3810319900512695, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.7027348279953003, + "num_tokens": 200784873.0, + "step": 7927 + }, + { + "epoch": 0.8706347463211069, + "grad_norm": 1.99115788936615, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.6962546110153198, + "num_tokens": 200814339.0, + "step": 7928 + }, + { + "epoch": 0.8707445640237206, + "grad_norm": 2.3300912380218506, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7057203054428101, + "num_tokens": 200835782.0, + "step": 7929 + }, + { + "epoch": 0.8708543817263343, + "grad_norm": 2.34964919090271, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7218145132064819, + "num_tokens": 200858816.0, + "step": 7930 + }, + { + "epoch": 0.870964199428948, + "grad_norm": 2.0684001445770264, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7125059366226196, + "num_tokens": 200886210.0, + "step": 7931 + }, + { + "epoch": 0.8710740171315616, + 
"grad_norm": 2.4702560901641846, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7228457927703857, + "num_tokens": 200906226.0, + "step": 7932 + }, + { + "epoch": 0.8711838348341753, + "grad_norm": 1.9951516389846802, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7094911932945251, + "num_tokens": 200939046.0, + "step": 7933 + }, + { + "epoch": 0.8712936525367889, + "grad_norm": 2.4937472343444824, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7302095890045166, + "num_tokens": 200959999.0, + "step": 7934 + }, + { + "epoch": 0.8714034702394026, + "grad_norm": 2.1564395427703857, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7238929867744446, + "num_tokens": 200987079.0, + "step": 7935 + }, + { + "epoch": 0.8715132879420162, + "grad_norm": 2.06264328956604, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.6977163553237915, + "num_tokens": 201017190.0, + "step": 7936 + }, + { + "epoch": 0.87162310564463, + "grad_norm": 2.358096122741699, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7152212858200073, + "num_tokens": 201039310.0, + "step": 7937 + }, + { + "epoch": 0.8717329233472436, + "grad_norm": 2.6012063026428223, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7163223624229431, + "num_tokens": 201058284.0, + "step": 7938 + }, + { + "epoch": 0.8718427410498573, + "grad_norm": 2.0650973320007324, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7162774801254272, + "num_tokens": 201086481.0, + "step": 7939 + }, + { + "epoch": 0.8719525587524709, + "grad_norm": 2.117647409439087, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.723160982131958, + "num_tokens": 201111860.0, + "step": 7940 + }, + { + "epoch": 0.8720623764550846, + "grad_norm": 2.181774616241455, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.7012336254119873, + 
"num_tokens": 201138333.0, + "step": 7941 + }, + { + "epoch": 0.8721721941576982, + "grad_norm": 2.29496169090271, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.6960844993591309, + "num_tokens": 201163612.0, + "step": 7942 + }, + { + "epoch": 0.8722820118603118, + "grad_norm": 2.322844982147217, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7097642421722412, + "num_tokens": 201184642.0, + "step": 7943 + }, + { + "epoch": 0.8723918295629255, + "grad_norm": 2.061593532562256, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7224242687225342, + "num_tokens": 201209901.0, + "step": 7944 + }, + { + "epoch": 0.8725016472655392, + "grad_norm": 2.0664703845977783, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.697837233543396, + "num_tokens": 201238751.0, + "step": 7945 + }, + { + "epoch": 0.8726114649681529, + "grad_norm": 2.0476620197296143, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.6981469392776489, + "num_tokens": 201268447.0, + "step": 7946 + }, + { + "epoch": 0.8727212826707665, + "grad_norm": 2.115701913833618, + "learning_rate": 1e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.6862268447875977, + "num_tokens": 201296592.0, + "step": 7947 + }, + { + "epoch": 0.8728311003733802, + "grad_norm": 2.3233063220977783, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7171944379806519, + "num_tokens": 201318601.0, + "step": 7948 + }, + { + "epoch": 0.8729409180759938, + "grad_norm": 2.158781051635742, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7132512331008911, + "num_tokens": 201344958.0, + "step": 7949 + }, + { + "epoch": 0.8730507357786075, + "grad_norm": 2.114363670349121, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.6989526748657227, + "num_tokens": 201374006.0, + "step": 7950 + }, + { + "epoch": 0.8731605534812211, + "grad_norm": 2.4002556800842285, + "learning_rate": 
1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7147602438926697, + "num_tokens": 201395141.0, + "step": 7951 + }, + { + "epoch": 0.8732703711838349, + "grad_norm": 1.9773313999176025, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7043117880821228, + "num_tokens": 201424716.0, + "step": 7952 + }, + { + "epoch": 0.8733801888864485, + "grad_norm": 1.9898658990859985, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.6990832090377808, + "num_tokens": 201455935.0, + "step": 7953 + }, + { + "epoch": 0.8734900065890622, + "grad_norm": 2.2851929664611816, + "learning_rate": 1e-06, + "loss": 1.0773, + "mean_token_accuracy": 0.6921471953392029, + "num_tokens": 201480377.0, + "step": 7954 + }, + { + "epoch": 0.8735998242916758, + "grad_norm": 2.2028563022613525, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7006226778030396, + "num_tokens": 201505272.0, + "step": 7955 + }, + { + "epoch": 0.8737096419942895, + "grad_norm": 2.0322089195251465, + "learning_rate": 1e-06, + "loss": 1.0512, + "mean_token_accuracy": 0.6845261454582214, + "num_tokens": 201536002.0, + "step": 7956 + }, + { + "epoch": 0.8738194596969031, + "grad_norm": 2.355780601501465, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.6918746829032898, + "num_tokens": 201560188.0, + "step": 7957 + }, + { + "epoch": 0.8739292773995168, + "grad_norm": 2.015582799911499, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.714091420173645, + "num_tokens": 201588116.0, + "step": 7958 + }, + { + "epoch": 0.8740390951021305, + "grad_norm": 1.964845895767212, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7085660099983215, + "num_tokens": 201617965.0, + "step": 7959 + }, + { + "epoch": 0.8741489128047442, + "grad_norm": 2.10191011428833, + "learning_rate": 1e-06, + "loss": 1.0849, + "mean_token_accuracy": 0.6785553097724915, + "num_tokens": 201647833.0, + "step": 7960 + }, + { + "epoch": 
0.8742587305073578, + "grad_norm": 2.4147262573242188, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7173927426338196, + "num_tokens": 201669525.0, + "step": 7961 + }, + { + "epoch": 0.8743685482099715, + "grad_norm": 2.0913643836975098, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.709012508392334, + "num_tokens": 201697650.0, + "step": 7962 + }, + { + "epoch": 0.8744783659125851, + "grad_norm": 2.0451483726501465, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7028411030769348, + "num_tokens": 201724514.0, + "step": 7963 + }, + { + "epoch": 0.8745881836151987, + "grad_norm": 2.648691415786743, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7218530774116516, + "num_tokens": 201744754.0, + "step": 7964 + }, + { + "epoch": 0.8746980013178124, + "grad_norm": 2.164069652557373, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7243119478225708, + "num_tokens": 201769055.0, + "step": 7965 + }, + { + "epoch": 0.8748078190204261, + "grad_norm": 2.0886213779449463, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7136647701263428, + "num_tokens": 201796411.0, + "step": 7966 + }, + { + "epoch": 0.8749176367230398, + "grad_norm": 2.155442953109741, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7035385370254517, + "num_tokens": 201821596.0, + "step": 7967 + }, + { + "epoch": 0.8750274544256534, + "grad_norm": 1.9553624391555786, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7124463319778442, + "num_tokens": 201854389.0, + "step": 7968 + }, + { + "epoch": 0.8751372721282671, + "grad_norm": 2.3773276805877686, + "learning_rate": 1e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7218536138534546, + "num_tokens": 201875208.0, + "step": 7969 + }, + { + "epoch": 0.8752470898308807, + "grad_norm": 2.4370298385620117, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 
0.7248173356056213, + "num_tokens": 201895475.0, + "step": 7970 + }, + { + "epoch": 0.8753569075334944, + "grad_norm": 2.1531851291656494, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7205137610435486, + "num_tokens": 201922925.0, + "step": 7971 + }, + { + "epoch": 0.875466725236108, + "grad_norm": 2.249896287918091, + "learning_rate": 1e-06, + "loss": 1.0533, + "mean_token_accuracy": 0.6962433457374573, + "num_tokens": 201950423.0, + "step": 7972 + }, + { + "epoch": 0.8755765429387217, + "grad_norm": 1.9778391122817993, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.707091212272644, + "num_tokens": 201980895.0, + "step": 7973 + }, + { + "epoch": 0.8756863606413354, + "grad_norm": 2.1671669483184814, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7057889699935913, + "num_tokens": 202008594.0, + "step": 7974 + }, + { + "epoch": 0.8757961783439491, + "grad_norm": 2.278078556060791, + "learning_rate": 1e-06, + "loss": 1.0298, + "mean_token_accuracy": 0.6876308917999268, + "num_tokens": 202032324.0, + "step": 7975 + }, + { + "epoch": 0.8759059960465627, + "grad_norm": 2.0479347705841064, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7211248278617859, + "num_tokens": 202060333.0, + "step": 7976 + }, + { + "epoch": 0.8760158137491764, + "grad_norm": 2.3021514415740967, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7095048427581787, + "num_tokens": 202084451.0, + "step": 7977 + }, + { + "epoch": 0.87612563145179, + "grad_norm": 2.549469470977783, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7156314849853516, + "num_tokens": 202103122.0, + "step": 7978 + }, + { + "epoch": 0.8762354491544037, + "grad_norm": 2.3864288330078125, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.6995651721954346, + "num_tokens": 202124419.0, + "step": 7979 + }, + { + "epoch": 0.8763452668570173, + "grad_norm": 
2.3826568126678467, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7195489406585693, + "num_tokens": 202150460.0, + "step": 7980 + }, + { + "epoch": 0.8764550845596311, + "grad_norm": 2.0546693801879883, + "learning_rate": 1e-06, + "loss": 1.0508, + "mean_token_accuracy": 0.68658846616745, + "num_tokens": 202179420.0, + "step": 7981 + }, + { + "epoch": 0.8765649022622447, + "grad_norm": 2.186687469482422, + "learning_rate": 1e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7239908576011658, + "num_tokens": 202203837.0, + "step": 7982 + }, + { + "epoch": 0.8766747199648584, + "grad_norm": 2.0937161445617676, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7123805284500122, + "num_tokens": 202232813.0, + "step": 7983 + }, + { + "epoch": 0.876784537667472, + "grad_norm": 2.2937090396881104, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.6998662948608398, + "num_tokens": 202255524.0, + "step": 7984 + }, + { + "epoch": 0.8768943553700856, + "grad_norm": 1.9686588048934937, + "learning_rate": 1e-06, + "loss": 1.0289, + "mean_token_accuracy": 0.6939647793769836, + "num_tokens": 202286009.0, + "step": 7985 + }, + { + "epoch": 0.8770041730726993, + "grad_norm": 2.0695297718048096, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7083731889724731, + "num_tokens": 202312143.0, + "step": 7986 + }, + { + "epoch": 0.8771139907753129, + "grad_norm": 2.3843889236450195, + "learning_rate": 1e-06, + "loss": 1.068, + "mean_token_accuracy": 0.6912757158279419, + "num_tokens": 202336173.0, + "step": 7987 + }, + { + "epoch": 0.8772238084779267, + "grad_norm": 2.0158281326293945, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.6934921741485596, + "num_tokens": 202366220.0, + "step": 7988 + }, + { + "epoch": 0.8773336261805403, + "grad_norm": 2.041966438293457, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6936430931091309, + "num_tokens": 
202394963.0, + "step": 7989 + }, + { + "epoch": 0.877443443883154, + "grad_norm": 2.3255276679992676, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7146559953689575, + "num_tokens": 202416992.0, + "step": 7990 + }, + { + "epoch": 0.8775532615857676, + "grad_norm": 2.2861595153808594, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.6973081827163696, + "num_tokens": 202439682.0, + "step": 7991 + }, + { + "epoch": 0.8776630792883813, + "grad_norm": 2.231092691421509, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7119972705841064, + "num_tokens": 202465080.0, + "step": 7992 + }, + { + "epoch": 0.8777728969909949, + "grad_norm": 2.0816311836242676, + "learning_rate": 1e-06, + "loss": 0.8551, + "mean_token_accuracy": 0.7328248620033264, + "num_tokens": 202490241.0, + "step": 7993 + }, + { + "epoch": 0.8778827146936086, + "grad_norm": 1.9548243284225464, + "learning_rate": 1e-06, + "loss": 1.0434, + "mean_token_accuracy": 0.6793522834777832, + "num_tokens": 202525163.0, + "step": 7994 + }, + { + "epoch": 0.8779925323962223, + "grad_norm": 2.2416210174560547, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.6960241198539734, + "num_tokens": 202550226.0, + "step": 7995 + }, + { + "epoch": 0.878102350098836, + "grad_norm": 1.935872197151184, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.6903388500213623, + "num_tokens": 202584045.0, + "step": 7996 + }, + { + "epoch": 0.8782121678014496, + "grad_norm": 2.2158162593841553, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7163465023040771, + "num_tokens": 202608246.0, + "step": 7997 + }, + { + "epoch": 0.8783219855040633, + "grad_norm": 2.150928258895874, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.6950064301490784, + "num_tokens": 202633441.0, + "step": 7998 + }, + { + "epoch": 0.8784318032066769, + "grad_norm": 2.073395013809204, + "learning_rate": 1e-06, + 
"loss": 1.0155, + "mean_token_accuracy": 0.6921801567077637, + "num_tokens": 202662885.0, + "step": 7999 + }, + { + "epoch": 0.8785416209092906, + "grad_norm": 2.4053878784179688, + "learning_rate": 1e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7219999432563782, + "num_tokens": 202682985.0, + "step": 8000 + }, + { + "epoch": 0.8786514386119042, + "grad_norm": 2.006464958190918, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.698365330696106, + "num_tokens": 202712762.0, + "step": 8001 + }, + { + "epoch": 0.8787612563145178, + "grad_norm": 2.331775665283203, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7174654006958008, + "num_tokens": 202735002.0, + "step": 8002 + }, + { + "epoch": 0.8788710740171316, + "grad_norm": 2.616884231567383, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7167783975601196, + "num_tokens": 202753944.0, + "step": 8003 + }, + { + "epoch": 0.8789808917197452, + "grad_norm": 2.0618233680725098, + "learning_rate": 1e-06, + "loss": 1.0595, + "mean_token_accuracy": 0.6847874522209167, + "num_tokens": 202783404.0, + "step": 8004 + }, + { + "epoch": 0.8790907094223589, + "grad_norm": 2.141814947128296, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.721631646156311, + "num_tokens": 202808213.0, + "step": 8005 + }, + { + "epoch": 0.8792005271249725, + "grad_norm": 2.5891008377075195, + "learning_rate": 1e-06, + "loss": 0.8352, + "mean_token_accuracy": 0.7362134456634521, + "num_tokens": 202826025.0, + "step": 8006 + }, + { + "epoch": 0.8793103448275862, + "grad_norm": 2.321972370147705, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.6971802115440369, + "num_tokens": 202849675.0, + "step": 8007 + }, + { + "epoch": 0.8794201625301998, + "grad_norm": 2.1184513568878174, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7128418684005737, + "num_tokens": 202875877.0, + "step": 8008 + }, + { + "epoch": 
0.8795299802328135, + "grad_norm": 2.1805405616760254, + "learning_rate": 1e-06, + "loss": 0.8185, + "mean_token_accuracy": 0.7430984377861023, + "num_tokens": 202899086.0, + "step": 8009 + }, + { + "epoch": 0.8796397979354272, + "grad_norm": 2.078523635864258, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7111706733703613, + "num_tokens": 202924926.0, + "step": 8010 + }, + { + "epoch": 0.8797496156380409, + "grad_norm": 2.321721315383911, + "learning_rate": 1e-06, + "loss": 0.858, + "mean_token_accuracy": 0.7346460223197937, + "num_tokens": 202947885.0, + "step": 8011 + }, + { + "epoch": 0.8798594333406545, + "grad_norm": 2.1592588424682617, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7247757911682129, + "num_tokens": 202975270.0, + "step": 8012 + }, + { + "epoch": 0.8799692510432682, + "grad_norm": 2.1958160400390625, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7130640745162964, + "num_tokens": 202999696.0, + "step": 8013 + }, + { + "epoch": 0.8800790687458818, + "grad_norm": 2.167210578918457, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7097810506820679, + "num_tokens": 203027943.0, + "step": 8014 + }, + { + "epoch": 0.8801888864484955, + "grad_norm": 2.2338638305664062, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.7008575201034546, + "num_tokens": 203052608.0, + "step": 8015 + }, + { + "epoch": 0.8802987041511091, + "grad_norm": 2.0255775451660156, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7037562131881714, + "num_tokens": 203081267.0, + "step": 8016 + }, + { + "epoch": 0.8804085218537229, + "grad_norm": 2.5557525157928467, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7167072296142578, + "num_tokens": 203102277.0, + "step": 8017 + }, + { + "epoch": 0.8805183395563365, + "grad_norm": 2.172339916229248, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 
0.6940330862998962, + "num_tokens": 203127178.0, + "step": 8018 + }, + { + "epoch": 0.8806281572589502, + "grad_norm": 2.200939178466797, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7049373388290405, + "num_tokens": 203152560.0, + "step": 8019 + }, + { + "epoch": 0.8807379749615638, + "grad_norm": 2.0576860904693604, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7098170518875122, + "num_tokens": 203181264.0, + "step": 8020 + }, + { + "epoch": 0.8808477926641775, + "grad_norm": 2.1710751056671143, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7102208137512207, + "num_tokens": 203204883.0, + "step": 8021 + }, + { + "epoch": 0.8809576103667911, + "grad_norm": 2.45854115486145, + "learning_rate": 1e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.730006754398346, + "num_tokens": 203225237.0, + "step": 8022 + }, + { + "epoch": 0.8810674280694047, + "grad_norm": 2.1288559436798096, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7074528336524963, + "num_tokens": 203252759.0, + "step": 8023 + }, + { + "epoch": 0.8811772457720185, + "grad_norm": 2.0148212909698486, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.7009437084197998, + "num_tokens": 203282385.0, + "step": 8024 + }, + { + "epoch": 0.8812870634746321, + "grad_norm": 2.0560784339904785, + "learning_rate": 1e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7239022850990295, + "num_tokens": 203311433.0, + "step": 8025 + }, + { + "epoch": 0.8813968811772458, + "grad_norm": 2.2557363510131836, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7136029005050659, + "num_tokens": 203335329.0, + "step": 8026 + }, + { + "epoch": 0.8815066988798594, + "grad_norm": 1.8831608295440674, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.6973138451576233, + "num_tokens": 203365349.0, + "step": 8027 + }, + { + "epoch": 0.8816165165824731, + "grad_norm": 
2.117770195007324, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7200634479522705, + "num_tokens": 203391545.0, + "step": 8028 + }, + { + "epoch": 0.8817263342850867, + "grad_norm": 2.219860553741455, + "learning_rate": 1e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7117437124252319, + "num_tokens": 203415883.0, + "step": 8029 + }, + { + "epoch": 0.8818361519877004, + "grad_norm": 1.9546003341674805, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7186511754989624, + "num_tokens": 203444843.0, + "step": 8030 + }, + { + "epoch": 0.881945969690314, + "grad_norm": 2.3008294105529785, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7206275463104248, + "num_tokens": 203466803.0, + "step": 8031 + }, + { + "epoch": 0.8820557873929278, + "grad_norm": 2.6884255409240723, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7221753597259521, + "num_tokens": 203485538.0, + "step": 8032 + }, + { + "epoch": 0.8821656050955414, + "grad_norm": 2.1091437339782715, + "learning_rate": 1e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7183314561843872, + "num_tokens": 203512093.0, + "step": 8033 + }, + { + "epoch": 0.8822754227981551, + "grad_norm": 2.3172271251678467, + "learning_rate": 1e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7205483913421631, + "num_tokens": 203535275.0, + "step": 8034 + }, + { + "epoch": 0.8823852405007687, + "grad_norm": 2.199605941772461, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7185223698616028, + "num_tokens": 203558827.0, + "step": 8035 + }, + { + "epoch": 0.8824950582033824, + "grad_norm": 2.387148141860962, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7052716612815857, + "num_tokens": 203580102.0, + "step": 8036 + }, + { + "epoch": 0.882604875905996, + "grad_norm": 1.8851743936538696, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7072957754135132, + "num_tokens": 
203611507.0, + "step": 8037 + }, + { + "epoch": 0.8827146936086097, + "grad_norm": 2.273460626602173, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7052768468856812, + "num_tokens": 203633983.0, + "step": 8038 + }, + { + "epoch": 0.8828245113112234, + "grad_norm": 2.009446144104004, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7029489278793335, + "num_tokens": 203663671.0, + "step": 8039 + }, + { + "epoch": 0.8829343290138371, + "grad_norm": 2.0911784172058105, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7120717763900757, + "num_tokens": 203691097.0, + "step": 8040 + }, + { + "epoch": 0.8830441467164507, + "grad_norm": 2.0961945056915283, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7272924184799194, + "num_tokens": 203715902.0, + "step": 8041 + }, + { + "epoch": 0.8831539644190644, + "grad_norm": 2.2159998416900635, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.6939427852630615, + "num_tokens": 203741853.0, + "step": 8042 + }, + { + "epoch": 0.883263782121678, + "grad_norm": 2.3670339584350586, + "learning_rate": 1e-06, + "loss": 0.8448, + "mean_token_accuracy": 0.7295007705688477, + "num_tokens": 203762631.0, + "step": 8043 + }, + { + "epoch": 0.8833735998242916, + "grad_norm": 2.0961341857910156, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7231001853942871, + "num_tokens": 203788120.0, + "step": 8044 + }, + { + "epoch": 0.8834834175269053, + "grad_norm": 2.2953310012817383, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.6939722299575806, + "num_tokens": 203811629.0, + "step": 8045 + }, + { + "epoch": 0.883593235229519, + "grad_norm": 2.4343926906585693, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.7085763812065125, + "num_tokens": 203835431.0, + "step": 8046 + }, + { + "epoch": 0.8837030529321327, + "grad_norm": 2.130316734313965, + "learning_rate": 1e-06, + 
"loss": 0.9573, + "mean_token_accuracy": 0.703579843044281, + "num_tokens": 203860000.0, + "step": 8047 + }, + { + "epoch": 0.8838128706347463, + "grad_norm": 1.9164563417434692, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7289091348648071, + "num_tokens": 203890612.0, + "step": 8048 + }, + { + "epoch": 0.88392268833736, + "grad_norm": 2.3524601459503174, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7016624212265015, + "num_tokens": 203914795.0, + "step": 8049 + }, + { + "epoch": 0.8840325060399736, + "grad_norm": 2.079878091812134, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7099233269691467, + "num_tokens": 203941113.0, + "step": 8050 + }, + { + "epoch": 0.8841423237425873, + "grad_norm": 2.432000160217285, + "learning_rate": 1e-06, + "loss": 0.8177, + "mean_token_accuracy": 0.7424469590187073, + "num_tokens": 203960533.0, + "step": 8051 + }, + { + "epoch": 0.8842521414452009, + "grad_norm": 2.1105637550354004, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7180805206298828, + "num_tokens": 203986934.0, + "step": 8052 + }, + { + "epoch": 0.8843619591478147, + "grad_norm": 2.3350508213043213, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7036874294281006, + "num_tokens": 204009456.0, + "step": 8053 + }, + { + "epoch": 0.8844717768504283, + "grad_norm": 2.2783384323120117, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7159978151321411, + "num_tokens": 204034160.0, + "step": 8054 + }, + { + "epoch": 0.884581594553042, + "grad_norm": 2.319676637649536, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.70467209815979, + "num_tokens": 204057284.0, + "step": 8055 + }, + { + "epoch": 0.8846914122556556, + "grad_norm": 2.2642414569854736, + "learning_rate": 1e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.723563551902771, + "num_tokens": 204080025.0, + "step": 8056 + }, + { + "epoch": 
0.8848012299582693, + "grad_norm": 2.172908067703247, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7093200087547302, + "num_tokens": 204106402.0, + "step": 8057 + }, + { + "epoch": 0.8849110476608829, + "grad_norm": 1.7740521430969238, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.6985470652580261, + "num_tokens": 204142589.0, + "step": 8058 + }, + { + "epoch": 0.8850208653634966, + "grad_norm": 2.2528443336486816, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.70863276720047, + "num_tokens": 204167214.0, + "step": 8059 + }, + { + "epoch": 0.8851306830661102, + "grad_norm": 1.9744855165481567, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.6894935369491577, + "num_tokens": 204197923.0, + "step": 8060 + }, + { + "epoch": 0.885240500768724, + "grad_norm": 2.316542387008667, + "learning_rate": 1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.6926920413970947, + "num_tokens": 204222829.0, + "step": 8061 + }, + { + "epoch": 0.8853503184713376, + "grad_norm": 2.150719165802002, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7062076330184937, + "num_tokens": 204249691.0, + "step": 8062 + }, + { + "epoch": 0.8854601361739513, + "grad_norm": 1.9468704462051392, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.714864194393158, + "num_tokens": 204278563.0, + "step": 8063 + }, + { + "epoch": 0.8855699538765649, + "grad_norm": 2.2288002967834473, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7185152173042297, + "num_tokens": 204304132.0, + "step": 8064 + }, + { + "epoch": 0.8856797715791785, + "grad_norm": 2.1572492122650146, + "learning_rate": 1e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.6866428852081299, + "num_tokens": 204331183.0, + "step": 8065 + }, + { + "epoch": 0.8857895892817922, + "grad_norm": 2.114183187484741, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 
0.7121858596801758, + "num_tokens": 204357728.0, + "step": 8066 + }, + { + "epoch": 0.8858994069844058, + "grad_norm": 2.1800851821899414, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7040301561355591, + "num_tokens": 204384082.0, + "step": 8067 + }, + { + "epoch": 0.8860092246870196, + "grad_norm": 2.2477307319641113, + "learning_rate": 1e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7344835996627808, + "num_tokens": 204407810.0, + "step": 8068 + }, + { + "epoch": 0.8861190423896332, + "grad_norm": 2.8201892375946045, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.6939430236816406, + "num_tokens": 204426487.0, + "step": 8069 + }, + { + "epoch": 0.8862288600922469, + "grad_norm": 2.3739233016967773, + "learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.73504239320755, + "num_tokens": 204447245.0, + "step": 8070 + }, + { + "epoch": 0.8863386777948605, + "grad_norm": 2.177598476409912, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7362405061721802, + "num_tokens": 204473805.0, + "step": 8071 + }, + { + "epoch": 0.8864484954974742, + "grad_norm": 2.498518943786621, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.711128830909729, + "num_tokens": 204494415.0, + "step": 8072 + }, + { + "epoch": 0.8865583132000878, + "grad_norm": 2.064552068710327, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7218068242073059, + "num_tokens": 204521889.0, + "step": 8073 + }, + { + "epoch": 0.8866681309027015, + "grad_norm": 2.1428945064544678, + "learning_rate": 1e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7322162389755249, + "num_tokens": 204548742.0, + "step": 8074 + }, + { + "epoch": 0.8867779486053152, + "grad_norm": 2.142435073852539, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7029911279678345, + "num_tokens": 204575748.0, + "step": 8075 + }, + { + "epoch": 0.8868877663079289, + "grad_norm": 
2.0600168704986572, + "learning_rate": 1e-06, + "loss": 0.8704, + "mean_token_accuracy": 0.7277584075927734, + "num_tokens": 204600919.0, + "step": 8076 + }, + { + "epoch": 0.8869975840105425, + "grad_norm": 2.313847541809082, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7027420997619629, + "num_tokens": 204625417.0, + "step": 8077 + }, + { + "epoch": 0.8871074017131562, + "grad_norm": 1.9784729480743408, + "learning_rate": 1e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.6958238482475281, + "num_tokens": 204656192.0, + "step": 8078 + }, + { + "epoch": 0.8872172194157698, + "grad_norm": 2.427852153778076, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7075397372245789, + "num_tokens": 204677457.0, + "step": 8079 + }, + { + "epoch": 0.8873270371183835, + "grad_norm": 2.387056827545166, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7006695866584778, + "num_tokens": 204697906.0, + "step": 8080 + }, + { + "epoch": 0.8874368548209971, + "grad_norm": 2.328477144241333, + "learning_rate": 1e-06, + "loss": 1.0669, + "mean_token_accuracy": 0.677254319190979, + "num_tokens": 204720088.0, + "step": 8081 + }, + { + "epoch": 0.8875466725236109, + "grad_norm": 2.4919676780700684, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7113687992095947, + "num_tokens": 204740363.0, + "step": 8082 + }, + { + "epoch": 0.8876564902262245, + "grad_norm": 2.4853312969207764, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7189326286315918, + "num_tokens": 204760793.0, + "step": 8083 + }, + { + "epoch": 0.8877663079288381, + "grad_norm": 2.1349070072174072, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.705726146697998, + "num_tokens": 204787552.0, + "step": 8084 + }, + { + "epoch": 0.8878761256314518, + "grad_norm": 2.244210958480835, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7090561985969543, + "num_tokens": 
204810955.0, + "step": 8085 + }, + { + "epoch": 0.8879859433340654, + "grad_norm": 2.257833480834961, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.711876392364502, + "num_tokens": 204836301.0, + "step": 8086 + }, + { + "epoch": 0.8880957610366791, + "grad_norm": 2.111670732498169, + "learning_rate": 1e-06, + "loss": 1.0269, + "mean_token_accuracy": 0.6916447877883911, + "num_tokens": 204864761.0, + "step": 8087 + }, + { + "epoch": 0.8882055787392927, + "grad_norm": 2.4812817573547363, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7194557785987854, + "num_tokens": 204886154.0, + "step": 8088 + }, + { + "epoch": 0.8883153964419065, + "grad_norm": 2.449247121810913, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7179999351501465, + "num_tokens": 204907623.0, + "step": 8089 + }, + { + "epoch": 0.8884252141445201, + "grad_norm": 1.986351728439331, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.6982260942459106, + "num_tokens": 204939180.0, + "step": 8090 + }, + { + "epoch": 0.8885350318471338, + "grad_norm": 1.8463095426559448, + "learning_rate": 1e-06, + "loss": 1.0258, + "mean_token_accuracy": 0.6885620355606079, + "num_tokens": 204973196.0, + "step": 8091 + }, + { + "epoch": 0.8886448495497474, + "grad_norm": 2.1191506385803223, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7027386426925659, + "num_tokens": 204999595.0, + "step": 8092 + }, + { + "epoch": 0.8887546672523611, + "grad_norm": 2.1553938388824463, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7021211981773376, + "num_tokens": 205027521.0, + "step": 8093 + }, + { + "epoch": 0.8888644849549747, + "grad_norm": 1.971664547920227, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7156000137329102, + "num_tokens": 205056936.0, + "step": 8094 + }, + { + "epoch": 0.8889743026575884, + "grad_norm": 2.028050422668457, + "learning_rate": 1e-06, + 
"loss": 0.9575, + "mean_token_accuracy": 0.7035903930664062, + "num_tokens": 205085949.0, + "step": 8095 + }, + { + "epoch": 0.889084120360202, + "grad_norm": 2.3685457706451416, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7169647216796875, + "num_tokens": 205109391.0, + "step": 8096 + }, + { + "epoch": 0.8891939380628158, + "grad_norm": 2.1187846660614014, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7189685106277466, + "num_tokens": 205135621.0, + "step": 8097 + }, + { + "epoch": 0.8893037557654294, + "grad_norm": 2.1711812019348145, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.6917717456817627, + "num_tokens": 205162030.0, + "step": 8098 + }, + { + "epoch": 0.8894135734680431, + "grad_norm": 2.2934675216674805, + "learning_rate": 1e-06, + "loss": 0.7556, + "mean_token_accuracy": 0.7572011947631836, + "num_tokens": 205182724.0, + "step": 8099 + }, + { + "epoch": 0.8895233911706567, + "grad_norm": 2.0449044704437256, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.6837604641914368, + "num_tokens": 205217075.0, + "step": 8100 + }, + { + "epoch": 0.8896332088732704, + "grad_norm": 2.0287227630615234, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.7017281651496887, + "num_tokens": 205244953.0, + "step": 8101 + }, + { + "epoch": 0.889743026575884, + "grad_norm": 2.3795809745788574, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7084611654281616, + "num_tokens": 205269018.0, + "step": 8102 + }, + { + "epoch": 0.8898528442784976, + "grad_norm": 1.8948343992233276, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.6941261291503906, + "num_tokens": 205302743.0, + "step": 8103 + }, + { + "epoch": 0.8899626619811114, + "grad_norm": 2.2323665618896484, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7027983665466309, + "num_tokens": 205327219.0, + "step": 8104 + }, + { + "epoch": 
0.890072479683725, + "grad_norm": 2.079604148864746, + "learning_rate": 1e-06, + "loss": 1.0803, + "mean_token_accuracy": 0.6762822866439819, + "num_tokens": 205357959.0, + "step": 8105 + }, + { + "epoch": 0.8901822973863387, + "grad_norm": 1.9406652450561523, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.6992961168289185, + "num_tokens": 205386636.0, + "step": 8106 + }, + { + "epoch": 0.8902921150889523, + "grad_norm": 2.2094368934631348, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7123268246650696, + "num_tokens": 205411948.0, + "step": 8107 + }, + { + "epoch": 0.890401932791566, + "grad_norm": 2.6336629390716553, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7288891673088074, + "num_tokens": 205429991.0, + "step": 8108 + }, + { + "epoch": 0.8905117504941796, + "grad_norm": 1.9183446168899536, + "learning_rate": 1e-06, + "loss": 1.0484, + "mean_token_accuracy": 0.6904858350753784, + "num_tokens": 205460548.0, + "step": 8109 + }, + { + "epoch": 0.8906215681967933, + "grad_norm": 2.1528985500335693, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7113697528839111, + "num_tokens": 205486167.0, + "step": 8110 + }, + { + "epoch": 0.890731385899407, + "grad_norm": 2.3425002098083496, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7034538388252258, + "num_tokens": 205511348.0, + "step": 8111 + }, + { + "epoch": 0.8908412036020207, + "grad_norm": 2.075094223022461, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.726407527923584, + "num_tokens": 205537739.0, + "step": 8112 + }, + { + "epoch": 0.8909510213046343, + "grad_norm": 2.561270236968994, + "learning_rate": 1e-06, + "loss": 0.8757, + "mean_token_accuracy": 0.7270275354385376, + "num_tokens": 205556789.0, + "step": 8113 + }, + { + "epoch": 0.891060839007248, + "grad_norm": 2.533254384994507, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 
0.7249671816825867, + "num_tokens": 205576388.0, + "step": 8114 + }, + { + "epoch": 0.8911706567098616, + "grad_norm": 1.8607661724090576, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.6968812942504883, + "num_tokens": 205608506.0, + "step": 8115 + }, + { + "epoch": 0.8912804744124753, + "grad_norm": 1.9655753374099731, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7329425811767578, + "num_tokens": 205636341.0, + "step": 8116 + }, + { + "epoch": 0.8913902921150889, + "grad_norm": 2.312335968017578, + "learning_rate": 1e-06, + "loss": 0.8484, + "mean_token_accuracy": 0.7297260761260986, + "num_tokens": 205658128.0, + "step": 8117 + }, + { + "epoch": 0.8915001098177027, + "grad_norm": 2.247955560684204, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.6967820525169373, + "num_tokens": 205681937.0, + "step": 8118 + }, + { + "epoch": 0.8916099275203163, + "grad_norm": 2.1977431774139404, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7162196636199951, + "num_tokens": 205708252.0, + "step": 8119 + }, + { + "epoch": 0.89171974522293, + "grad_norm": 2.1818532943725586, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.712494969367981, + "num_tokens": 205734781.0, + "step": 8120 + }, + { + "epoch": 0.8918295629255436, + "grad_norm": 2.01827073097229, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7012938857078552, + "num_tokens": 205764242.0, + "step": 8121 + }, + { + "epoch": 0.8919393806281573, + "grad_norm": 2.051084041595459, + "learning_rate": 1e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.7037005424499512, + "num_tokens": 205791719.0, + "step": 8122 + }, + { + "epoch": 0.8920491983307709, + "grad_norm": 2.1994500160217285, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.7033206224441528, + "num_tokens": 205816109.0, + "step": 8123 + }, + { + "epoch": 0.8921590160333845, + "grad_norm": 2.5690758228302, 
+ "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7154045104980469, + "num_tokens": 205834672.0, + "step": 8124 + }, + { + "epoch": 0.8922688337359982, + "grad_norm": 2.126574993133545, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7272225618362427, + "num_tokens": 205860669.0, + "step": 8125 + }, + { + "epoch": 0.892378651438612, + "grad_norm": 2.290093421936035, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7125728130340576, + "num_tokens": 205882711.0, + "step": 8126 + }, + { + "epoch": 0.8924884691412256, + "grad_norm": 2.2359089851379395, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.722719132900238, + "num_tokens": 205905084.0, + "step": 8127 + }, + { + "epoch": 0.8925982868438392, + "grad_norm": 2.405087947845459, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.717284083366394, + "num_tokens": 205926536.0, + "step": 8128 + }, + { + "epoch": 0.8927081045464529, + "grad_norm": 2.4952995777130127, + "learning_rate": 1e-06, + "loss": 0.8288, + "mean_token_accuracy": 0.739810049533844, + "num_tokens": 205945690.0, + "step": 8129 + }, + { + "epoch": 0.8928179222490665, + "grad_norm": 2.0376365184783936, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.712893009185791, + "num_tokens": 205972760.0, + "step": 8130 + }, + { + "epoch": 0.8929277399516802, + "grad_norm": 2.5821166038513184, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7037490606307983, + "num_tokens": 205993214.0, + "step": 8131 + }, + { + "epoch": 0.8930375576542938, + "grad_norm": 1.7460132837295532, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.703534722328186, + "num_tokens": 206032161.0, + "step": 8132 + }, + { + "epoch": 0.8931473753569076, + "grad_norm": 2.3755056858062744, + "learning_rate": 1e-06, + "loss": 1.0803, + "mean_token_accuracy": 0.679756760597229, + "num_tokens": 206059392.0, + "step": 8133 + }, 
+ { + "epoch": 0.8932571930595212, + "grad_norm": 2.0627009868621826, + "learning_rate": 1e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.6833131909370422, + "num_tokens": 206088622.0, + "step": 8134 + }, + { + "epoch": 0.8933670107621349, + "grad_norm": 2.0983924865722656, + "learning_rate": 1e-06, + "loss": 1.0421, + "mean_token_accuracy": 0.6839284896850586, + "num_tokens": 206116499.0, + "step": 8135 + }, + { + "epoch": 0.8934768284647485, + "grad_norm": 2.229752779006958, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7182707786560059, + "num_tokens": 206141805.0, + "step": 8136 + }, + { + "epoch": 0.8935866461673622, + "grad_norm": 2.07328724861145, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.710098147392273, + "num_tokens": 206170068.0, + "step": 8137 + }, + { + "epoch": 0.8936964638699758, + "grad_norm": 1.9564391374588013, + "learning_rate": 1e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7198669910430908, + "num_tokens": 206199738.0, + "step": 8138 + }, + { + "epoch": 0.8938062815725895, + "grad_norm": 2.0684916973114014, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7020970582962036, + "num_tokens": 206227707.0, + "step": 8139 + }, + { + "epoch": 0.8939160992752032, + "grad_norm": 2.362424373626709, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7107280492782593, + "num_tokens": 206250760.0, + "step": 8140 + }, + { + "epoch": 0.8940259169778169, + "grad_norm": 2.256193161010742, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.6984546184539795, + "num_tokens": 206274278.0, + "step": 8141 + }, + { + "epoch": 0.8941357346804305, + "grad_norm": 2.396336555480957, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7143726944923401, + "num_tokens": 206295332.0, + "step": 8142 + }, + { + "epoch": 0.8942455523830442, + "grad_norm": 2.4063546657562256, + "learning_rate": 1e-06, + "loss": 0.9903, + 
"mean_token_accuracy": 0.7076636552810669, + "num_tokens": 206317887.0, + "step": 8143 + }, + { + "epoch": 0.8943553700856578, + "grad_norm": 1.8330578804016113, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.6910728812217712, + "num_tokens": 206354164.0, + "step": 8144 + }, + { + "epoch": 0.8944651877882714, + "grad_norm": 1.8968427181243896, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.697075605392456, + "num_tokens": 206386596.0, + "step": 8145 + }, + { + "epoch": 0.8945750054908851, + "grad_norm": 1.9582067728042603, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7157386541366577, + "num_tokens": 206415332.0, + "step": 8146 + }, + { + "epoch": 0.8946848231934988, + "grad_norm": 2.204608201980591, + "learning_rate": 1e-06, + "loss": 1.0707, + "mean_token_accuracy": 0.6824336051940918, + "num_tokens": 206444094.0, + "step": 8147 + }, + { + "epoch": 0.8947946408961125, + "grad_norm": 2.3883426189422607, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7198821902275085, + "num_tokens": 206465627.0, + "step": 8148 + }, + { + "epoch": 0.8949044585987261, + "grad_norm": 2.672886371612549, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7029323577880859, + "num_tokens": 206484023.0, + "step": 8149 + }, + { + "epoch": 0.8950142763013398, + "grad_norm": 2.279224395751953, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.7049355506896973, + "num_tokens": 206507829.0, + "step": 8150 + }, + { + "epoch": 0.8951240940039534, + "grad_norm": 2.0608584880828857, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.7052679061889648, + "num_tokens": 206535405.0, + "step": 8151 + }, + { + "epoch": 0.8952339117065671, + "grad_norm": 2.180720090866089, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7079876065254211, + "num_tokens": 206562645.0, + "step": 8152 + }, + { + "epoch": 0.8953437294091807, + 
"grad_norm": 2.2013602256774902, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7140370607376099, + "num_tokens": 206588254.0, + "step": 8153 + }, + { + "epoch": 0.8954535471117944, + "grad_norm": 2.0797345638275146, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7111543416976929, + "num_tokens": 206614917.0, + "step": 8154 + }, + { + "epoch": 0.8955633648144081, + "grad_norm": 2.0902156829833984, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7119824886322021, + "num_tokens": 206640333.0, + "step": 8155 + }, + { + "epoch": 0.8956731825170218, + "grad_norm": 2.095128297805786, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7244844436645508, + "num_tokens": 206664641.0, + "step": 8156 + }, + { + "epoch": 0.8957830002196354, + "grad_norm": 2.103330612182617, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7140811085700989, + "num_tokens": 206690876.0, + "step": 8157 + }, + { + "epoch": 0.8958928179222491, + "grad_norm": 2.139873743057251, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.720035970211029, + "num_tokens": 206714850.0, + "step": 8158 + }, + { + "epoch": 0.8960026356248627, + "grad_norm": 2.3155806064605713, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7161101698875427, + "num_tokens": 206738680.0, + "step": 8159 + }, + { + "epoch": 0.8961124533274764, + "grad_norm": 2.1265695095062256, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.6989884972572327, + "num_tokens": 206765283.0, + "step": 8160 + }, + { + "epoch": 0.89622227103009, + "grad_norm": 2.349170446395874, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7034446001052856, + "num_tokens": 206786425.0, + "step": 8161 + }, + { + "epoch": 0.8963320887327038, + "grad_norm": 2.0390572547912598, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7138177156448364, + 
"num_tokens": 206814467.0, + "step": 8162 + }, + { + "epoch": 0.8964419064353174, + "grad_norm": 2.624739170074463, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7220473289489746, + "num_tokens": 206833373.0, + "step": 8163 + }, + { + "epoch": 0.896551724137931, + "grad_norm": 2.3044698238372803, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7110581994056702, + "num_tokens": 206855388.0, + "step": 8164 + }, + { + "epoch": 0.8966615418405447, + "grad_norm": 2.089275598526001, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.6958950161933899, + "num_tokens": 206882264.0, + "step": 8165 + }, + { + "epoch": 0.8967713595431583, + "grad_norm": 2.2391042709350586, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7272186279296875, + "num_tokens": 206907454.0, + "step": 8166 + }, + { + "epoch": 0.896881177245772, + "grad_norm": 2.0182735919952393, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7113202810287476, + "num_tokens": 206935711.0, + "step": 8167 + }, + { + "epoch": 0.8969909949483856, + "grad_norm": 1.978796362876892, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.70229172706604, + "num_tokens": 206967601.0, + "step": 8168 + }, + { + "epoch": 0.8971008126509994, + "grad_norm": 2.216174840927124, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7123361229896545, + "num_tokens": 206991030.0, + "step": 8169 + }, + { + "epoch": 0.897210630353613, + "grad_norm": 2.132449150085449, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7130485773086548, + "num_tokens": 207015881.0, + "step": 8170 + }, + { + "epoch": 0.8973204480562267, + "grad_norm": 2.1430652141571045, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.6931891441345215, + "num_tokens": 207044590.0, + "step": 8171 + }, + { + "epoch": 0.8974302657588403, + "grad_norm": 2.226414918899536, + "learning_rate": 
1e-06, + "loss": 0.8573, + "mean_token_accuracy": 0.7310279607772827, + "num_tokens": 207067609.0, + "step": 8172 + }, + { + "epoch": 0.897540083461454, + "grad_norm": 2.4458374977111816, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.6984940767288208, + "num_tokens": 207088455.0, + "step": 8173 + }, + { + "epoch": 0.8976499011640676, + "grad_norm": 2.2314319610595703, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7163641452789307, + "num_tokens": 207113217.0, + "step": 8174 + }, + { + "epoch": 0.8977597188666813, + "grad_norm": 2.1508588790893555, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.6917951107025146, + "num_tokens": 207139971.0, + "step": 8175 + }, + { + "epoch": 0.897869536569295, + "grad_norm": 2.3232617378234863, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.6976941823959351, + "num_tokens": 207163049.0, + "step": 8176 + }, + { + "epoch": 0.8979793542719087, + "grad_norm": 2.099374771118164, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7073661088943481, + "num_tokens": 207189443.0, + "step": 8177 + }, + { + "epoch": 0.8980891719745223, + "grad_norm": 2.222275972366333, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6874983310699463, + "num_tokens": 207216054.0, + "step": 8178 + }, + { + "epoch": 0.898198989677136, + "grad_norm": 2.4978318214416504, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7118943929672241, + "num_tokens": 207235718.0, + "step": 8179 + }, + { + "epoch": 0.8983088073797496, + "grad_norm": 2.1699984073638916, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7023565769195557, + "num_tokens": 207261870.0, + "step": 8180 + }, + { + "epoch": 0.8984186250823633, + "grad_norm": 2.377786159515381, + "learning_rate": 1e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7366201877593994, + "num_tokens": 207284526.0, + "step": 8181 + }, + { + "epoch": 
0.8985284427849769, + "grad_norm": 2.225694179534912, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.6980230808258057, + "num_tokens": 207309920.0, + "step": 8182 + }, + { + "epoch": 0.8986382604875905, + "grad_norm": 2.0566186904907227, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.6959406733512878, + "num_tokens": 207339277.0, + "step": 8183 + }, + { + "epoch": 0.8987480781902043, + "grad_norm": 1.9989904165267944, + "learning_rate": 1e-06, + "loss": 1.0577, + "mean_token_accuracy": 0.690443754196167, + "num_tokens": 207369996.0, + "step": 8184 + }, + { + "epoch": 0.898857895892818, + "grad_norm": 2.49491548538208, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.7050231099128723, + "num_tokens": 207393035.0, + "step": 8185 + }, + { + "epoch": 0.8989677135954316, + "grad_norm": 2.00759220123291, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.687825083732605, + "num_tokens": 207427251.0, + "step": 8186 + }, + { + "epoch": 0.8990775312980452, + "grad_norm": 2.456660270690918, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7082223296165466, + "num_tokens": 207449332.0, + "step": 8187 + }, + { + "epoch": 0.8991873490006589, + "grad_norm": 2.037308931350708, + "learning_rate": 1e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.700137734413147, + "num_tokens": 207479448.0, + "step": 8188 + }, + { + "epoch": 0.8992971667032725, + "grad_norm": 2.1330246925354004, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7006821632385254, + "num_tokens": 207507776.0, + "step": 8189 + }, + { + "epoch": 0.8994069844058862, + "grad_norm": 2.157230854034424, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.6978203058242798, + "num_tokens": 207535585.0, + "step": 8190 + }, + { + "epoch": 0.8995168021084999, + "grad_norm": 2.2683980464935303, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 
0.7290077209472656, + "num_tokens": 207558934.0, + "step": 8191 + }, + { + "epoch": 0.8996266198111136, + "grad_norm": 2.10300874710083, + "learning_rate": 1e-06, + "loss": 1.0251, + "mean_token_accuracy": 0.6934195756912231, + "num_tokens": 207586818.0, + "step": 8192 + }, + { + "epoch": 0.8997364375137272, + "grad_norm": 2.096529245376587, + "learning_rate": 1e-06, + "loss": 0.8438, + "mean_token_accuracy": 0.7385222911834717, + "num_tokens": 207611836.0, + "step": 8193 + }, + { + "epoch": 0.8998462552163409, + "grad_norm": 2.172062635421753, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.6992630362510681, + "num_tokens": 207638477.0, + "step": 8194 + }, + { + "epoch": 0.8999560729189545, + "grad_norm": 2.102804183959961, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7088676691055298, + "num_tokens": 207665349.0, + "step": 8195 + }, + { + "epoch": 0.9000658906215682, + "grad_norm": 2.2775092124938965, + "learning_rate": 1e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.6966590881347656, + "num_tokens": 207692362.0, + "step": 8196 + }, + { + "epoch": 0.9001757083241818, + "grad_norm": 2.108022689819336, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7150828838348389, + "num_tokens": 207719287.0, + "step": 8197 + }, + { + "epoch": 0.9002855260267956, + "grad_norm": 2.2255046367645264, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.6998176574707031, + "num_tokens": 207744975.0, + "step": 8198 + }, + { + "epoch": 0.9003953437294092, + "grad_norm": 1.9267319440841675, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7076431512832642, + "num_tokens": 207774763.0, + "step": 8199 + }, + { + "epoch": 0.9005051614320229, + "grad_norm": 2.1332108974456787, + "learning_rate": 1e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.696317195892334, + "num_tokens": 207802214.0, + "step": 8200 + }, + { + "epoch": 0.9006149791346365, + "grad_norm": 
2.2907588481903076, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.7004480361938477, + "num_tokens": 207826465.0, + "step": 8201 + }, + { + "epoch": 0.9007247968372502, + "grad_norm": 2.301938772201538, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7096577882766724, + "num_tokens": 207850608.0, + "step": 8202 + }, + { + "epoch": 0.9008346145398638, + "grad_norm": 2.1161582469940186, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.6938715577125549, + "num_tokens": 207878721.0, + "step": 8203 + }, + { + "epoch": 0.9009444322424774, + "grad_norm": 2.0831727981567383, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.713843822479248, + "num_tokens": 207906732.0, + "step": 8204 + }, + { + "epoch": 0.9010542499450912, + "grad_norm": 2.449246406555176, + "learning_rate": 1e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.733283281326294, + "num_tokens": 207926191.0, + "step": 8205 + }, + { + "epoch": 0.9011640676477048, + "grad_norm": 2.1327245235443115, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7052291035652161, + "num_tokens": 207950927.0, + "step": 8206 + }, + { + "epoch": 0.9012738853503185, + "grad_norm": 2.086604356765747, + "learning_rate": 1e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7260241508483887, + "num_tokens": 207978423.0, + "step": 8207 + }, + { + "epoch": 0.9013837030529321, + "grad_norm": 2.329963445663452, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.71971595287323, + "num_tokens": 208000872.0, + "step": 8208 + }, + { + "epoch": 0.9014935207555458, + "grad_norm": 2.007550001144409, + "learning_rate": 1e-06, + "loss": 1.0622, + "mean_token_accuracy": 0.6968937516212463, + "num_tokens": 208031765.0, + "step": 8209 + }, + { + "epoch": 0.9016033384581594, + "grad_norm": 2.077359199523926, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.7093386054039001, + "num_tokens": 208059639.0, 
+ "step": 8210 + }, + { + "epoch": 0.9017131561607731, + "grad_norm": 1.965735912322998, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6936293244361877, + "num_tokens": 208089342.0, + "step": 8211 + }, + { + "epoch": 0.9018229738633867, + "grad_norm": 2.413534641265869, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7217063307762146, + "num_tokens": 208109629.0, + "step": 8212 + }, + { + "epoch": 0.9019327915660005, + "grad_norm": 3.3238725662231445, + "learning_rate": 1e-06, + "loss": 0.7607, + "mean_token_accuracy": 0.7507026195526123, + "num_tokens": 208121858.0, + "step": 8213 + }, + { + "epoch": 0.9020426092686141, + "grad_norm": 2.456441640853882, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.6925801038742065, + "num_tokens": 208144693.0, + "step": 8214 + }, + { + "epoch": 0.9021524269712278, + "grad_norm": 2.4611260890960693, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7272186279296875, + "num_tokens": 208164129.0, + "step": 8215 + }, + { + "epoch": 0.9022622446738414, + "grad_norm": 2.373905897140503, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7159177660942078, + "num_tokens": 208186384.0, + "step": 8216 + }, + { + "epoch": 0.9023720623764551, + "grad_norm": 2.1516213417053223, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.689477801322937, + "num_tokens": 208212247.0, + "step": 8217 + }, + { + "epoch": 0.9024818800790687, + "grad_norm": 2.091686248779297, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.6922590732574463, + "num_tokens": 208239821.0, + "step": 8218 + }, + { + "epoch": 0.9025916977816824, + "grad_norm": 2.391202688217163, + "learning_rate": 1e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.7268059849739075, + "num_tokens": 208260308.0, + "step": 8219 + }, + { + "epoch": 0.9027015154842961, + "grad_norm": 2.143266201019287, + "learning_rate": 1e-06, + "loss": 1.0568, + 
"mean_token_accuracy": 0.6848922371864319, + "num_tokens": 208287027.0, + "step": 8220 + }, + { + "epoch": 0.9028113331869098, + "grad_norm": 2.2017974853515625, + "learning_rate": 1e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.7314590215682983, + "num_tokens": 208312001.0, + "step": 8221 + }, + { + "epoch": 0.9029211508895234, + "grad_norm": 2.4487359523773193, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7280622720718384, + "num_tokens": 208331974.0, + "step": 8222 + }, + { + "epoch": 0.903030968592137, + "grad_norm": 1.9032719135284424, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7000527381896973, + "num_tokens": 208360634.0, + "step": 8223 + }, + { + "epoch": 0.9031407862947507, + "grad_norm": 2.1926026344299316, + "learning_rate": 1e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7262842655181885, + "num_tokens": 208384537.0, + "step": 8224 + }, + { + "epoch": 0.9032506039973643, + "grad_norm": 2.3919525146484375, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7167114019393921, + "num_tokens": 208405889.0, + "step": 8225 + }, + { + "epoch": 0.903360421699978, + "grad_norm": 2.5103745460510254, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7048441171646118, + "num_tokens": 208426266.0, + "step": 8226 + }, + { + "epoch": 0.9034702394025917, + "grad_norm": 2.04830002784729, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.6957193613052368, + "num_tokens": 208455818.0, + "step": 8227 + }, + { + "epoch": 0.9035800571052054, + "grad_norm": 2.19622802734375, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7043105363845825, + "num_tokens": 208482229.0, + "step": 8228 + }, + { + "epoch": 0.903689874807819, + "grad_norm": 2.288961410522461, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7123804092407227, + "num_tokens": 208506211.0, + "step": 8229 + }, + { + "epoch": 0.9037996925104327, + 
"grad_norm": 2.16117262840271, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.709281325340271, + "num_tokens": 208533089.0, + "step": 8230 + }, + { + "epoch": 0.9039095102130463, + "grad_norm": 2.4274544715881348, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.705154538154602, + "num_tokens": 208554807.0, + "step": 8231 + }, + { + "epoch": 0.90401932791566, + "grad_norm": 2.0404160022735596, + "learning_rate": 1e-06, + "loss": 1.0565, + "mean_token_accuracy": 0.6824778914451599, + "num_tokens": 208584133.0, + "step": 8232 + }, + { + "epoch": 0.9041291456182736, + "grad_norm": 2.087306499481201, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7094743847846985, + "num_tokens": 208610301.0, + "step": 8233 + }, + { + "epoch": 0.9042389633208874, + "grad_norm": 2.308401584625244, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7338733673095703, + "num_tokens": 208631848.0, + "step": 8234 + }, + { + "epoch": 0.904348781023501, + "grad_norm": 2.1261589527130127, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.6960143446922302, + "num_tokens": 208656637.0, + "step": 8235 + }, + { + "epoch": 0.9044585987261147, + "grad_norm": 2.2631478309631348, + "learning_rate": 1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7158088684082031, + "num_tokens": 208680066.0, + "step": 8236 + }, + { + "epoch": 0.9045684164287283, + "grad_norm": 2.3416316509246826, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7150756120681763, + "num_tokens": 208703928.0, + "step": 8237 + }, + { + "epoch": 0.904678234131342, + "grad_norm": 2.082360029220581, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.697670578956604, + "num_tokens": 208731790.0, + "step": 8238 + }, + { + "epoch": 0.9047880518339556, + "grad_norm": 2.2694146633148193, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7112518548965454, + "num_tokens": 
208755832.0, + "step": 8239 + }, + { + "epoch": 0.9048978695365693, + "grad_norm": 1.933162808418274, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7021661996841431, + "num_tokens": 208789471.0, + "step": 8240 + }, + { + "epoch": 0.9050076872391829, + "grad_norm": 2.1721246242523193, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7022826075553894, + "num_tokens": 208817890.0, + "step": 8241 + }, + { + "epoch": 0.9051175049417967, + "grad_norm": 2.297757148742676, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.6877502202987671, + "num_tokens": 208844718.0, + "step": 8242 + }, + { + "epoch": 0.9052273226444103, + "grad_norm": 2.2854318618774414, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7186179161071777, + "num_tokens": 208869230.0, + "step": 8243 + }, + { + "epoch": 0.905337140347024, + "grad_norm": 2.549281358718872, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.720780074596405, + "num_tokens": 208889328.0, + "step": 8244 + }, + { + "epoch": 0.9054469580496376, + "grad_norm": 2.544518232345581, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.6984785795211792, + "num_tokens": 208910410.0, + "step": 8245 + }, + { + "epoch": 0.9055567757522512, + "grad_norm": 2.130648136138916, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.6951835751533508, + "num_tokens": 208937369.0, + "step": 8246 + }, + { + "epoch": 0.9056665934548649, + "grad_norm": 2.158766508102417, + "learning_rate": 1e-06, + "loss": 0.8578, + "mean_token_accuracy": 0.7417142987251282, + "num_tokens": 208963451.0, + "step": 8247 + }, + { + "epoch": 0.9057764111574785, + "grad_norm": 2.1884658336639404, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7166489958763123, + "num_tokens": 208988700.0, + "step": 8248 + }, + { + "epoch": 0.9058862288600923, + "grad_norm": 2.359058380126953, + "learning_rate": 1e-06, + 
"loss": 0.8864, + "mean_token_accuracy": 0.7236984968185425, + "num_tokens": 209009896.0, + "step": 8249 + }, + { + "epoch": 0.9059960465627059, + "grad_norm": 2.3367738723754883, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7029229402542114, + "num_tokens": 209032113.0, + "step": 8250 + }, + { + "epoch": 0.9061058642653196, + "grad_norm": 2.1103274822235107, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7182117700576782, + "num_tokens": 209059845.0, + "step": 8251 + }, + { + "epoch": 0.9062156819679332, + "grad_norm": 2.2288129329681396, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.7099747657775879, + "num_tokens": 209083382.0, + "step": 8252 + }, + { + "epoch": 0.9063254996705469, + "grad_norm": 2.0536623001098633, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.7006312608718872, + "num_tokens": 209113010.0, + "step": 8253 + }, + { + "epoch": 0.9064353173731605, + "grad_norm": 2.2084343433380127, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.6906019449234009, + "num_tokens": 209138955.0, + "step": 8254 + }, + { + "epoch": 0.9065451350757742, + "grad_norm": 2.0034019947052, + "learning_rate": 1e-06, + "loss": 1.0506, + "mean_token_accuracy": 0.6858310103416443, + "num_tokens": 209168592.0, + "step": 8255 + }, + { + "epoch": 0.9066549527783879, + "grad_norm": 2.0579328536987305, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7107693552970886, + "num_tokens": 209193517.0, + "step": 8256 + }, + { + "epoch": 0.9067647704810016, + "grad_norm": 2.4195291996002197, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.722603976726532, + "num_tokens": 209214680.0, + "step": 8257 + }, + { + "epoch": 0.9068745881836152, + "grad_norm": 1.9983892440795898, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7050795555114746, + "num_tokens": 209242403.0, + "step": 8258 + }, + { + "epoch": 
0.9069844058862289, + "grad_norm": 2.5727272033691406, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7160311937332153, + "num_tokens": 209262141.0, + "step": 8259 + }, + { + "epoch": 0.9070942235888425, + "grad_norm": 2.5690135955810547, + "learning_rate": 1e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7290627360343933, + "num_tokens": 209281832.0, + "step": 8260 + }, + { + "epoch": 0.9072040412914562, + "grad_norm": 2.2746386528015137, + "learning_rate": 1e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7220151424407959, + "num_tokens": 209302827.0, + "step": 8261 + }, + { + "epoch": 0.9073138589940698, + "grad_norm": 2.381168842315674, + "learning_rate": 1e-06, + "loss": 0.8403, + "mean_token_accuracy": 0.7339710593223572, + "num_tokens": 209324515.0, + "step": 8262 + }, + { + "epoch": 0.9074236766966836, + "grad_norm": 2.1014065742492676, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7091602087020874, + "num_tokens": 209350440.0, + "step": 8263 + }, + { + "epoch": 0.9075334943992972, + "grad_norm": 2.2749993801116943, + "learning_rate": 1e-06, + "loss": 0.7899, + "mean_token_accuracy": 0.7504764795303345, + "num_tokens": 209372941.0, + "step": 8264 + }, + { + "epoch": 0.9076433121019108, + "grad_norm": 2.0009636878967285, + "learning_rate": 1e-06, + "loss": 1.1502, + "mean_token_accuracy": 0.6672115325927734, + "num_tokens": 209402903.0, + "step": 8265 + }, + { + "epoch": 0.9077531298045245, + "grad_norm": 2.301065444946289, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.7014982104301453, + "num_tokens": 209426578.0, + "step": 8266 + }, + { + "epoch": 0.9078629475071381, + "grad_norm": 1.8636311292648315, + "learning_rate": 1e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.6933556199073792, + "num_tokens": 209460763.0, + "step": 8267 + }, + { + "epoch": 0.9079727652097518, + "grad_norm": 2.353449821472168, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 
0.7046902179718018, + "num_tokens": 209482932.0, + "step": 8268 + }, + { + "epoch": 0.9080825829123654, + "grad_norm": 2.3177685737609863, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7297688722610474, + "num_tokens": 209505794.0, + "step": 8269 + }, + { + "epoch": 0.9081924006149792, + "grad_norm": 2.3281147480010986, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7113636136054993, + "num_tokens": 209528383.0, + "step": 8270 + }, + { + "epoch": 0.9083022183175928, + "grad_norm": 2.093380928039551, + "learning_rate": 1e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.7007295489311218, + "num_tokens": 209556623.0, + "step": 8271 + }, + { + "epoch": 0.9084120360202065, + "grad_norm": 2.1641898155212402, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.6964931488037109, + "num_tokens": 209583432.0, + "step": 8272 + }, + { + "epoch": 0.9085218537228201, + "grad_norm": 2.092501401901245, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7061747312545776, + "num_tokens": 209609425.0, + "step": 8273 + }, + { + "epoch": 0.9086316714254338, + "grad_norm": 2.204219102859497, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7206573486328125, + "num_tokens": 209633413.0, + "step": 8274 + }, + { + "epoch": 0.9087414891280474, + "grad_norm": 2.0456793308258057, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7124581336975098, + "num_tokens": 209662364.0, + "step": 8275 + }, + { + "epoch": 0.9088513068306611, + "grad_norm": 2.0270020961761475, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.6959104537963867, + "num_tokens": 209693834.0, + "step": 8276 + }, + { + "epoch": 0.9089611245332747, + "grad_norm": 2.3285272121429443, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7207818627357483, + "num_tokens": 209716844.0, + "step": 8277 + }, + { + "epoch": 0.9090709422358885, + "grad_norm": 
2.060767412185669, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.711966335773468, + "num_tokens": 209745552.0, + "step": 8278 + }, + { + "epoch": 0.9091807599385021, + "grad_norm": 2.4622011184692383, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7195755243301392, + "num_tokens": 209767859.0, + "step": 8279 + }, + { + "epoch": 0.9092905776411158, + "grad_norm": 2.3467977046966553, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.7072625160217285, + "num_tokens": 209791065.0, + "step": 8280 + }, + { + "epoch": 0.9094003953437294, + "grad_norm": 2.1144697666168213, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7041378021240234, + "num_tokens": 209818792.0, + "step": 8281 + }, + { + "epoch": 0.909510213046343, + "grad_norm": 1.949949026107788, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7033877372741699, + "num_tokens": 209847493.0, + "step": 8282 + }, + { + "epoch": 0.9096200307489567, + "grad_norm": 2.5818989276885986, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7092574238777161, + "num_tokens": 209866583.0, + "step": 8283 + }, + { + "epoch": 0.9097298484515703, + "grad_norm": 2.1810898780822754, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.6949862837791443, + "num_tokens": 209892782.0, + "step": 8284 + }, + { + "epoch": 0.9098396661541841, + "grad_norm": 2.541085720062256, + "learning_rate": 1e-06, + "loss": 0.877, + "mean_token_accuracy": 0.722957968711853, + "num_tokens": 209913321.0, + "step": 8285 + }, + { + "epoch": 0.9099494838567977, + "grad_norm": 2.5776097774505615, + "learning_rate": 1e-06, + "loss": 0.7633, + "mean_token_accuracy": 0.7525632977485657, + "num_tokens": 209931506.0, + "step": 8286 + }, + { + "epoch": 0.9100593015594114, + "grad_norm": 1.9576473236083984, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7044255137443542, + "num_tokens": 
209961662.0, + "step": 8287 + }, + { + "epoch": 0.910169119262025, + "grad_norm": 2.06284761428833, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.6943916082382202, + "num_tokens": 209988784.0, + "step": 8288 + }, + { + "epoch": 0.9102789369646387, + "grad_norm": 2.2704575061798096, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7229816317558289, + "num_tokens": 210011812.0, + "step": 8289 + }, + { + "epoch": 0.9103887546672523, + "grad_norm": 2.0778119564056396, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7193264961242676, + "num_tokens": 210039005.0, + "step": 8290 + }, + { + "epoch": 0.910498572369866, + "grad_norm": 2.374138355255127, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7138741612434387, + "num_tokens": 210061098.0, + "step": 8291 + }, + { + "epoch": 0.9106083900724797, + "grad_norm": 2.0731797218322754, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7154937386512756, + "num_tokens": 210087872.0, + "step": 8292 + }, + { + "epoch": 0.9107182077750934, + "grad_norm": 2.2176716327667236, + "learning_rate": 1e-06, + "loss": 1.0599, + "mean_token_accuracy": 0.6775726079940796, + "num_tokens": 210115297.0, + "step": 8293 + }, + { + "epoch": 0.910828025477707, + "grad_norm": 2.3039395809173584, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7117676734924316, + "num_tokens": 210139731.0, + "step": 8294 + }, + { + "epoch": 0.9109378431803207, + "grad_norm": 1.9840519428253174, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.6942176818847656, + "num_tokens": 210168350.0, + "step": 8295 + }, + { + "epoch": 0.9110476608829343, + "grad_norm": 2.125708818435669, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7050235271453857, + "num_tokens": 210193973.0, + "step": 8296 + }, + { + "epoch": 0.911157478585548, + "grad_norm": 2.4337081909179688, + "learning_rate": 1e-06, + 
"loss": 0.9071, + "mean_token_accuracy": 0.7207794189453125, + "num_tokens": 210214093.0, + "step": 8297 + }, + { + "epoch": 0.9112672962881616, + "grad_norm": 3.11607027053833, + "learning_rate": 1e-06, + "loss": 0.928, + "mean_token_accuracy": 0.7125968337059021, + "num_tokens": 210228819.0, + "step": 8298 + }, + { + "epoch": 0.9113771139907754, + "grad_norm": 2.239227533340454, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7205638289451599, + "num_tokens": 210252229.0, + "step": 8299 + }, + { + "epoch": 0.911486931693389, + "grad_norm": 2.7093892097473145, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7128543853759766, + "num_tokens": 210271957.0, + "step": 8300 + }, + { + "epoch": 0.9115967493960027, + "grad_norm": 2.2057790756225586, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7129600048065186, + "num_tokens": 210298445.0, + "step": 8301 + }, + { + "epoch": 0.9117065670986163, + "grad_norm": 2.1351842880249023, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.6974723935127258, + "num_tokens": 210324913.0, + "step": 8302 + }, + { + "epoch": 0.91181638480123, + "grad_norm": 2.303226947784424, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7151580452919006, + "num_tokens": 210346704.0, + "step": 8303 + }, + { + "epoch": 0.9119262025038436, + "grad_norm": 2.0166356563568115, + "learning_rate": 1e-06, + "loss": 1.0389, + "mean_token_accuracy": 0.6822280287742615, + "num_tokens": 210377629.0, + "step": 8304 + }, + { + "epoch": 0.9120360202064572, + "grad_norm": 2.1542937755584717, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7041499614715576, + "num_tokens": 210405793.0, + "step": 8305 + }, + { + "epoch": 0.9121458379090709, + "grad_norm": 2.1534156799316406, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7235351800918579, + "num_tokens": 210430934.0, + "step": 8306 + }, + { + "epoch": 
0.9122556556116846, + "grad_norm": 2.1552908420562744, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7241525053977966, + "num_tokens": 210454562.0, + "step": 8307 + }, + { + "epoch": 0.9123654733142983, + "grad_norm": 2.085394859313965, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.6948525309562683, + "num_tokens": 210483030.0, + "step": 8308 + }, + { + "epoch": 0.9124752910169119, + "grad_norm": 1.9382350444793701, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7147355675697327, + "num_tokens": 210514034.0, + "step": 8309 + }, + { + "epoch": 0.9125851087195256, + "grad_norm": 2.5751359462738037, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7101609110832214, + "num_tokens": 210533967.0, + "step": 8310 + }, + { + "epoch": 0.9126949264221392, + "grad_norm": 2.0687451362609863, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.6979496479034424, + "num_tokens": 210563036.0, + "step": 8311 + }, + { + "epoch": 0.9128047441247529, + "grad_norm": 2.1119558811187744, + "learning_rate": 1e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.6915841102600098, + "num_tokens": 210592771.0, + "step": 8312 + }, + { + "epoch": 0.9129145618273665, + "grad_norm": 2.297999620437622, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7255153656005859, + "num_tokens": 210614863.0, + "step": 8313 + }, + { + "epoch": 0.9130243795299803, + "grad_norm": 2.163468360900879, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7093268036842346, + "num_tokens": 210640249.0, + "step": 8314 + }, + { + "epoch": 0.9131341972325939, + "grad_norm": 2.156172752380371, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7073447108268738, + "num_tokens": 210666994.0, + "step": 8315 + }, + { + "epoch": 0.9132440149352076, + "grad_norm": 2.228707790374756, + "learning_rate": 1e-06, + "loss": 1.0308, + "mean_token_accuracy": 
0.6877971887588501, + "num_tokens": 210694551.0, + "step": 8316 + }, + { + "epoch": 0.9133538326378212, + "grad_norm": 2.1191256046295166, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.6902220249176025, + "num_tokens": 210722500.0, + "step": 8317 + }, + { + "epoch": 0.9134636503404349, + "grad_norm": 2.0050714015960693, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7110464572906494, + "num_tokens": 210750201.0, + "step": 8318 + }, + { + "epoch": 0.9135734680430485, + "grad_norm": 2.0660552978515625, + "learning_rate": 1e-06, + "loss": 1.0465, + "mean_token_accuracy": 0.6805782914161682, + "num_tokens": 210778661.0, + "step": 8319 + }, + { + "epoch": 0.9136832857456622, + "grad_norm": 2.3104937076568604, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7100464105606079, + "num_tokens": 210801911.0, + "step": 8320 + }, + { + "epoch": 0.9137931034482759, + "grad_norm": 2.281283140182495, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7076125144958496, + "num_tokens": 210825781.0, + "step": 8321 + }, + { + "epoch": 0.9139029211508896, + "grad_norm": 1.9927148818969727, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7190102338790894, + "num_tokens": 210854201.0, + "step": 8322 + }, + { + "epoch": 0.9140127388535032, + "grad_norm": 2.2951416969299316, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.7298396825790405, + "num_tokens": 210878040.0, + "step": 8323 + }, + { + "epoch": 0.9141225565561168, + "grad_norm": 2.2857582569122314, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7348787784576416, + "num_tokens": 210900507.0, + "step": 8324 + }, + { + "epoch": 0.9142323742587305, + "grad_norm": 2.0781545639038086, + "learning_rate": 1e-06, + "loss": 0.8845, + "mean_token_accuracy": 0.7198364734649658, + "num_tokens": 210928678.0, + "step": 8325 + }, + { + "epoch": 0.9143421919613441, + "grad_norm": 
2.170642614364624, + "learning_rate": 1e-06, + "loss": 1.0367, + "mean_token_accuracy": 0.6910656094551086, + "num_tokens": 210956247.0, + "step": 8326 + }, + { + "epoch": 0.9144520096639578, + "grad_norm": 2.0944831371307373, + "learning_rate": 1e-06, + "loss": 1.0557, + "mean_token_accuracy": 0.6823545694351196, + "num_tokens": 210983744.0, + "step": 8327 + }, + { + "epoch": 0.9145618273665715, + "grad_norm": 2.5778698921203613, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7155738472938538, + "num_tokens": 211003329.0, + "step": 8328 + }, + { + "epoch": 0.9146716450691852, + "grad_norm": 2.012993097305298, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.6904862523078918, + "num_tokens": 211032777.0, + "step": 8329 + }, + { + "epoch": 0.9147814627717988, + "grad_norm": 2.092611312866211, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.703925371170044, + "num_tokens": 211059874.0, + "step": 8330 + }, + { + "epoch": 0.9148912804744125, + "grad_norm": 2.242656707763672, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7023742198944092, + "num_tokens": 211085960.0, + "step": 8331 + }, + { + "epoch": 0.9150010981770261, + "grad_norm": 1.9806485176086426, + "learning_rate": 1e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.6972851753234863, + "num_tokens": 211115783.0, + "step": 8332 + }, + { + "epoch": 0.9151109158796398, + "grad_norm": 1.911534070968628, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.713802695274353, + "num_tokens": 211145032.0, + "step": 8333 + }, + { + "epoch": 0.9152207335822534, + "grad_norm": 2.149191379547119, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.724083662033081, + "num_tokens": 211168752.0, + "step": 8334 + }, + { + "epoch": 0.9153305512848671, + "grad_norm": 1.915370225906372, + "learning_rate": 1e-06, + "loss": 1.0344, + "mean_token_accuracy": 0.6857282519340515, + "num_tokens": 211201186.0, + 
"step": 8335 + }, + { + "epoch": 0.9154403689874808, + "grad_norm": 2.2618439197540283, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.6962285041809082, + "num_tokens": 211225269.0, + "step": 8336 + }, + { + "epoch": 0.9155501866900945, + "grad_norm": 2.5423412322998047, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7087090015411377, + "num_tokens": 211245863.0, + "step": 8337 + }, + { + "epoch": 0.9156600043927081, + "grad_norm": 1.9176687002182007, + "learning_rate": 1e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7021472454071045, + "num_tokens": 211277587.0, + "step": 8338 + }, + { + "epoch": 0.9157698220953218, + "grad_norm": 2.401218891143799, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.6994249224662781, + "num_tokens": 211300070.0, + "step": 8339 + }, + { + "epoch": 0.9158796397979354, + "grad_norm": 1.8430474996566772, + "learning_rate": 1e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.6989836096763611, + "num_tokens": 211336039.0, + "step": 8340 + }, + { + "epoch": 0.915989457500549, + "grad_norm": 2.0213570594787598, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7055234909057617, + "num_tokens": 211365856.0, + "step": 8341 + }, + { + "epoch": 0.9160992752031627, + "grad_norm": 1.9606478214263916, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.6927164793014526, + "num_tokens": 211395160.0, + "step": 8342 + }, + { + "epoch": 0.9162090929057765, + "grad_norm": 2.109917640686035, + "learning_rate": 1e-06, + "loss": 1.0426, + "mean_token_accuracy": 0.6868923306465149, + "num_tokens": 211423294.0, + "step": 8343 + }, + { + "epoch": 0.9163189106083901, + "grad_norm": 2.0409183502197266, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.704602837562561, + "num_tokens": 211452480.0, + "step": 8344 + }, + { + "epoch": 0.9164287283110037, + "grad_norm": 2.206738233566284, + "learning_rate": 1e-06, + "loss": 0.9783, 
+ "mean_token_accuracy": 0.6994358897209167, + "num_tokens": 211476370.0, + "step": 8345 + }, + { + "epoch": 0.9165385460136174, + "grad_norm": 2.3099896907806396, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7210462093353271, + "num_tokens": 211498348.0, + "step": 8346 + }, + { + "epoch": 0.916648363716231, + "grad_norm": 2.288677930831909, + "learning_rate": 1e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.6900979280471802, + "num_tokens": 211523778.0, + "step": 8347 + }, + { + "epoch": 0.9167581814188447, + "grad_norm": 2.1160054206848145, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7134475708007812, + "num_tokens": 211549242.0, + "step": 8348 + }, + { + "epoch": 0.9168679991214583, + "grad_norm": 2.127178192138672, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.713011622428894, + "num_tokens": 211576610.0, + "step": 8349 + }, + { + "epoch": 0.9169778168240721, + "grad_norm": 2.4001541137695312, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7074156999588013, + "num_tokens": 211598591.0, + "step": 8350 + }, + { + "epoch": 0.9170876345266857, + "grad_norm": 2.020350456237793, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.7121116518974304, + "num_tokens": 211629969.0, + "step": 8351 + }, + { + "epoch": 0.9171974522292994, + "grad_norm": 2.4002273082733154, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.694731593132019, + "num_tokens": 211651547.0, + "step": 8352 + }, + { + "epoch": 0.917307269931913, + "grad_norm": 2.538645029067993, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7191619873046875, + "num_tokens": 211672004.0, + "step": 8353 + }, + { + "epoch": 0.9174170876345267, + "grad_norm": 2.3758018016815186, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.6976135969161987, + "num_tokens": 211694346.0, + "step": 8354 + }, + { + "epoch": 0.9175269053371403, + 
"grad_norm": 2.453977584838867, + "learning_rate": 1e-06, + "loss": 0.8226, + "mean_token_accuracy": 0.7446959018707275, + "num_tokens": 211713649.0, + "step": 8355 + }, + { + "epoch": 0.917636723039754, + "grad_norm": 2.2231557369232178, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7192597389221191, + "num_tokens": 211739194.0, + "step": 8356 + }, + { + "epoch": 0.9177465407423677, + "grad_norm": 1.9320573806762695, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7135557532310486, + "num_tokens": 211770128.0, + "step": 8357 + }, + { + "epoch": 0.9178563584449814, + "grad_norm": 2.2948713302612305, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7245577573776245, + "num_tokens": 211793290.0, + "step": 8358 + }, + { + "epoch": 0.917966176147595, + "grad_norm": 1.962253212928772, + "learning_rate": 1e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.6978736519813538, + "num_tokens": 211822253.0, + "step": 8359 + }, + { + "epoch": 0.9180759938502087, + "grad_norm": 2.47578501701355, + "learning_rate": 1e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7228274941444397, + "num_tokens": 211844285.0, + "step": 8360 + }, + { + "epoch": 0.9181858115528223, + "grad_norm": 2.2303783893585205, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7075978517532349, + "num_tokens": 211870009.0, + "step": 8361 + }, + { + "epoch": 0.918295629255436, + "grad_norm": 2.283451557159424, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7072824239730835, + "num_tokens": 211892318.0, + "step": 8362 + }, + { + "epoch": 0.9184054469580496, + "grad_norm": 2.431173801422119, + "learning_rate": 1e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.7319172024726868, + "num_tokens": 211911983.0, + "step": 8363 + }, + { + "epoch": 0.9185152646606632, + "grad_norm": 2.459125518798828, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7132495045661926, + "num_tokens": 
211932132.0, + "step": 8364 + }, + { + "epoch": 0.918625082363277, + "grad_norm": 2.293625593185425, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.708723783493042, + "num_tokens": 211953837.0, + "step": 8365 + }, + { + "epoch": 0.9187349000658906, + "grad_norm": 2.215698719024658, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7121503949165344, + "num_tokens": 211977416.0, + "step": 8366 + }, + { + "epoch": 0.9188447177685043, + "grad_norm": 2.7580196857452393, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7184451222419739, + "num_tokens": 211994688.0, + "step": 8367 + }, + { + "epoch": 0.9189545354711179, + "grad_norm": 2.0089921951293945, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.6999759674072266, + "num_tokens": 212023621.0, + "step": 8368 + }, + { + "epoch": 0.9190643531737316, + "grad_norm": 2.18913197517395, + "learning_rate": 1e-06, + "loss": 0.8388, + "mean_token_accuracy": 0.7337790131568909, + "num_tokens": 212047676.0, + "step": 8369 + }, + { + "epoch": 0.9191741708763452, + "grad_norm": 1.8924845457077026, + "learning_rate": 1e-06, + "loss": 1.081, + "mean_token_accuracy": 0.6775421500205994, + "num_tokens": 212079528.0, + "step": 8370 + }, + { + "epoch": 0.9192839885789589, + "grad_norm": 2.347898006439209, + "learning_rate": 1e-06, + "loss": 0.7797, + "mean_token_accuracy": 0.7473992109298706, + "num_tokens": 212099955.0, + "step": 8371 + }, + { + "epoch": 0.9193938062815726, + "grad_norm": 2.14239501953125, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7085179090499878, + "num_tokens": 212124786.0, + "step": 8372 + }, + { + "epoch": 0.9195036239841863, + "grad_norm": 2.089761257171631, + "learning_rate": 1e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.7356300354003906, + "num_tokens": 212151031.0, + "step": 8373 + }, + { + "epoch": 0.9196134416867999, + "grad_norm": 2.438244104385376, + "learning_rate": 1e-06, + "loss": 
0.9169, + "mean_token_accuracy": 0.7170281410217285, + "num_tokens": 212171466.0, + "step": 8374 + }, + { + "epoch": 0.9197232593894136, + "grad_norm": 2.3496665954589844, + "learning_rate": 1e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7265720367431641, + "num_tokens": 212193210.0, + "step": 8375 + }, + { + "epoch": 0.9198330770920272, + "grad_norm": 1.9215824604034424, + "learning_rate": 1e-06, + "loss": 1.0194, + "mean_token_accuracy": 0.6929461359977722, + "num_tokens": 212225809.0, + "step": 8376 + }, + { + "epoch": 0.9199428947946409, + "grad_norm": 2.4346885681152344, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.700534999370575, + "num_tokens": 212247863.0, + "step": 8377 + }, + { + "epoch": 0.9200527124972545, + "grad_norm": 1.972886562347412, + "learning_rate": 1e-06, + "loss": 1.0387, + "mean_token_accuracy": 0.6899628639221191, + "num_tokens": 212280721.0, + "step": 8378 + }, + { + "epoch": 0.9201625301998683, + "grad_norm": 2.5632286071777344, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7274787425994873, + "num_tokens": 212300201.0, + "step": 8379 + }, + { + "epoch": 0.9202723479024819, + "grad_norm": 2.093803644180298, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.6939924359321594, + "num_tokens": 212326094.0, + "step": 8380 + }, + { + "epoch": 0.9203821656050956, + "grad_norm": 2.0888078212738037, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7229408025741577, + "num_tokens": 212351510.0, + "step": 8381 + }, + { + "epoch": 0.9204919833077092, + "grad_norm": 2.359062433242798, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7233198285102844, + "num_tokens": 212374819.0, + "step": 8382 + }, + { + "epoch": 0.9206018010103229, + "grad_norm": 2.3528597354888916, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.7036160230636597, + "num_tokens": 212397821.0, + "step": 8383 + }, + { + "epoch": 
0.9207116187129365, + "grad_norm": 2.16884446144104, + "learning_rate": 1e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7215202450752258, + "num_tokens": 212422337.0, + "step": 8384 + }, + { + "epoch": 0.9208214364155501, + "grad_norm": 2.0867605209350586, + "learning_rate": 1e-06, + "loss": 0.8501, + "mean_token_accuracy": 0.732209324836731, + "num_tokens": 212449502.0, + "step": 8385 + }, + { + "epoch": 0.9209312541181639, + "grad_norm": 2.2147090435028076, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.70697021484375, + "num_tokens": 212474016.0, + "step": 8386 + }, + { + "epoch": 0.9210410718207775, + "grad_norm": 2.111149787902832, + "learning_rate": 1e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7171719670295715, + "num_tokens": 212501711.0, + "step": 8387 + }, + { + "epoch": 0.9211508895233912, + "grad_norm": 2.4476358890533447, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.7027775645256042, + "num_tokens": 212523310.0, + "step": 8388 + }, + { + "epoch": 0.9212607072260048, + "grad_norm": 1.9523624181747437, + "learning_rate": 1e-06, + "loss": 0.866, + "mean_token_accuracy": 0.73557448387146, + "num_tokens": 212551473.0, + "step": 8389 + }, + { + "epoch": 0.9213705249286185, + "grad_norm": 2.2211811542510986, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7261942625045776, + "num_tokens": 212575588.0, + "step": 8390 + }, + { + "epoch": 0.9214803426312321, + "grad_norm": 2.070359706878662, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7108289003372192, + "num_tokens": 212602989.0, + "step": 8391 + }, + { + "epoch": 0.9215901603338458, + "grad_norm": 2.18314266204834, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.733045756816864, + "num_tokens": 212627258.0, + "step": 8392 + }, + { + "epoch": 0.9216999780364594, + "grad_norm": 2.1647064685821533, + "learning_rate": 1e-06, + "loss": 0.8502, + "mean_token_accuracy": 
0.7292402982711792, + "num_tokens": 212651507.0, + "step": 8393 + }, + { + "epoch": 0.9218097957390732, + "grad_norm": 2.1329259872436523, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.710003137588501, + "num_tokens": 212679758.0, + "step": 8394 + }, + { + "epoch": 0.9219196134416868, + "grad_norm": 2.262380361557007, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7233411073684692, + "num_tokens": 212701893.0, + "step": 8395 + }, + { + "epoch": 0.9220294311443005, + "grad_norm": 2.160322904586792, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7113236784934998, + "num_tokens": 212729544.0, + "step": 8396 + }, + { + "epoch": 0.9221392488469141, + "grad_norm": 2.0744128227233887, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7067160606384277, + "num_tokens": 212757772.0, + "step": 8397 + }, + { + "epoch": 0.9222490665495278, + "grad_norm": 2.2992098331451416, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7267606854438782, + "num_tokens": 212779343.0, + "step": 8398 + }, + { + "epoch": 0.9223588842521414, + "grad_norm": 2.2441844940185547, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.6951055526733398, + "num_tokens": 212805022.0, + "step": 8399 + }, + { + "epoch": 0.922468701954755, + "grad_norm": 1.8974603414535522, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.6941920518875122, + "num_tokens": 212837556.0, + "step": 8400 + }, + { + "epoch": 0.9225785196573688, + "grad_norm": 2.0211167335510254, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.6898216009140015, + "num_tokens": 212865733.0, + "step": 8401 + }, + { + "epoch": 0.9226883373599825, + "grad_norm": 2.0790998935699463, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.699661374092102, + "num_tokens": 212895912.0, + "step": 8402 + }, + { + "epoch": 0.9227981550625961, + "grad_norm": 
2.1091256141662598, + "learning_rate": 1e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7169312238693237, + "num_tokens": 212920141.0, + "step": 8403 + }, + { + "epoch": 0.9229079727652097, + "grad_norm": 2.0908870697021484, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.7029228210449219, + "num_tokens": 212947938.0, + "step": 8404 + }, + { + "epoch": 0.9230177904678234, + "grad_norm": 2.262097120285034, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7034612894058228, + "num_tokens": 212972963.0, + "step": 8405 + }, + { + "epoch": 0.923127608170437, + "grad_norm": 2.08422589302063, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7187792062759399, + "num_tokens": 212999029.0, + "step": 8406 + }, + { + "epoch": 0.9232374258730507, + "grad_norm": 2.0915911197662354, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7029191851615906, + "num_tokens": 213024666.0, + "step": 8407 + }, + { + "epoch": 0.9233472435756644, + "grad_norm": 2.2250139713287354, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7160993814468384, + "num_tokens": 213049462.0, + "step": 8408 + }, + { + "epoch": 0.9234570612782781, + "grad_norm": 2.404496192932129, + "learning_rate": 1e-06, + "loss": 0.8653, + "mean_token_accuracy": 0.7341319918632507, + "num_tokens": 213070237.0, + "step": 8409 + }, + { + "epoch": 0.9235668789808917, + "grad_norm": 2.017951250076294, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.6981270909309387, + "num_tokens": 213098251.0, + "step": 8410 + }, + { + "epoch": 0.9236766966835054, + "grad_norm": 2.100884199142456, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6932510733604431, + "num_tokens": 213126427.0, + "step": 8411 + }, + { + "epoch": 0.923786514386119, + "grad_norm": 2.1300952434539795, + "learning_rate": 1e-06, + "loss": 1.0431, + "mean_token_accuracy": 0.6854280233383179, + "num_tokens": 
213157269.0, + "step": 8412 + }, + { + "epoch": 0.9238963320887327, + "grad_norm": 2.27601957321167, + "learning_rate": 1e-06, + "loss": 0.8671, + "mean_token_accuracy": 0.7304049730300903, + "num_tokens": 213181048.0, + "step": 8413 + }, + { + "epoch": 0.9240061497913463, + "grad_norm": 2.0073323249816895, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7166329622268677, + "num_tokens": 213210022.0, + "step": 8414 + }, + { + "epoch": 0.9241159674939601, + "grad_norm": 1.9074389934539795, + "learning_rate": 1e-06, + "loss": 1.0491, + "mean_token_accuracy": 0.6805504560470581, + "num_tokens": 213242953.0, + "step": 8415 + }, + { + "epoch": 0.9242257851965737, + "grad_norm": 1.9186735153198242, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7072607278823853, + "num_tokens": 213274625.0, + "step": 8416 + }, + { + "epoch": 0.9243356028991874, + "grad_norm": 2.4085731506347656, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7243553400039673, + "num_tokens": 213296073.0, + "step": 8417 + }, + { + "epoch": 0.924445420601801, + "grad_norm": 2.234609842300415, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.6868628263473511, + "num_tokens": 213322198.0, + "step": 8418 + }, + { + "epoch": 0.9245552383044147, + "grad_norm": 2.3691203594207764, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7147108316421509, + "num_tokens": 213344167.0, + "step": 8419 + }, + { + "epoch": 0.9246650560070283, + "grad_norm": 2.357841968536377, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7057265043258667, + "num_tokens": 213366048.0, + "step": 8420 + }, + { + "epoch": 0.924774873709642, + "grad_norm": 2.7298760414123535, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7187268733978271, + "num_tokens": 213383909.0, + "step": 8421 + }, + { + "epoch": 0.9248846914122557, + "grad_norm": 2.238567590713501, + "learning_rate": 1e-06, + 
"loss": 0.9663, + "mean_token_accuracy": 0.7055834531784058, + "num_tokens": 213409046.0, + "step": 8422 + }, + { + "epoch": 0.9249945091148694, + "grad_norm": 2.1368260383605957, + "learning_rate": 1e-06, + "loss": 1.1027, + "mean_token_accuracy": 0.6852219104766846, + "num_tokens": 213438451.0, + "step": 8423 + }, + { + "epoch": 0.925104326817483, + "grad_norm": 2.0808887481689453, + "learning_rate": 1e-06, + "loss": 1.0199, + "mean_token_accuracy": 0.691281795501709, + "num_tokens": 213469765.0, + "step": 8424 + }, + { + "epoch": 0.9252141445200966, + "grad_norm": 2.408838987350464, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7054198980331421, + "num_tokens": 213492093.0, + "step": 8425 + }, + { + "epoch": 0.9253239622227103, + "grad_norm": 2.2335636615753174, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7059401273727417, + "num_tokens": 213515887.0, + "step": 8426 + }, + { + "epoch": 0.9254337799253239, + "grad_norm": 2.1009440422058105, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7043315172195435, + "num_tokens": 213541524.0, + "step": 8427 + }, + { + "epoch": 0.9255435976279376, + "grad_norm": 1.9899922609329224, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7108383178710938, + "num_tokens": 213570785.0, + "step": 8428 + }, + { + "epoch": 0.9256534153305512, + "grad_norm": 2.2805676460266113, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.7069501280784607, + "num_tokens": 213594448.0, + "step": 8429 + }, + { + "epoch": 0.925763233033165, + "grad_norm": 2.044055700302124, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7093686461448669, + "num_tokens": 213623618.0, + "step": 8430 + }, + { + "epoch": 0.9258730507357786, + "grad_norm": 2.125786066055298, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.7006433606147766, + "num_tokens": 213651114.0, + "step": 8431 + }, + { + "epoch": 
0.9259828684383923, + "grad_norm": 2.013098955154419, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.6947523951530457, + "num_tokens": 213684098.0, + "step": 8432 + }, + { + "epoch": 0.9260926861410059, + "grad_norm": 2.2492451667785645, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7259204387664795, + "num_tokens": 213707685.0, + "step": 8433 + }, + { + "epoch": 0.9262025038436196, + "grad_norm": 2.0540106296539307, + "learning_rate": 1e-06, + "loss": 0.8785, + "mean_token_accuracy": 0.7236736416816711, + "num_tokens": 213734601.0, + "step": 8434 + }, + { + "epoch": 0.9263123215462332, + "grad_norm": 2.040956735610962, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.705156683921814, + "num_tokens": 213763121.0, + "step": 8435 + }, + { + "epoch": 0.9264221392488469, + "grad_norm": 2.1275086402893066, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.6921719312667847, + "num_tokens": 213788406.0, + "step": 8436 + }, + { + "epoch": 0.9265319569514606, + "grad_norm": 2.199005365371704, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7098309993743896, + "num_tokens": 213814492.0, + "step": 8437 + }, + { + "epoch": 0.9266417746540743, + "grad_norm": 1.794618844985962, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.6937313675880432, + "num_tokens": 213850547.0, + "step": 8438 + }, + { + "epoch": 0.9267515923566879, + "grad_norm": 2.217947006225586, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.696580708026886, + "num_tokens": 213874825.0, + "step": 8439 + }, + { + "epoch": 0.9268614100593016, + "grad_norm": 2.380837917327881, + "learning_rate": 1e-06, + "loss": 0.7751, + "mean_token_accuracy": 0.7495934963226318, + "num_tokens": 213896721.0, + "step": 8440 + }, + { + "epoch": 0.9269712277619152, + "grad_norm": 2.030266046524048, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 
0.6932116746902466, + "num_tokens": 213925501.0, + "step": 8441 + }, + { + "epoch": 0.9270810454645289, + "grad_norm": 2.2900493144989014, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7112126350402832, + "num_tokens": 213947475.0, + "step": 8442 + }, + { + "epoch": 0.9271908631671425, + "grad_norm": 2.086918830871582, + "learning_rate": 1e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.7280150651931763, + "num_tokens": 213970670.0, + "step": 8443 + }, + { + "epoch": 0.9273006808697563, + "grad_norm": 2.162490129470825, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.6898127794265747, + "num_tokens": 213999423.0, + "step": 8444 + }, + { + "epoch": 0.9274104985723699, + "grad_norm": 1.884495496749878, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.6941341161727905, + "num_tokens": 214031382.0, + "step": 8445 + }, + { + "epoch": 0.9275203162749835, + "grad_norm": 2.7024810314178467, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7126942276954651, + "num_tokens": 214048832.0, + "step": 8446 + }, + { + "epoch": 0.9276301339775972, + "grad_norm": 2.1876325607299805, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7021100521087646, + "num_tokens": 214076612.0, + "step": 8447 + }, + { + "epoch": 0.9277399516802108, + "grad_norm": 2.5805716514587402, + "learning_rate": 1e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7311299443244934, + "num_tokens": 214094496.0, + "step": 8448 + }, + { + "epoch": 0.9278497693828245, + "grad_norm": 2.0218844413757324, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.6901275515556335, + "num_tokens": 214123548.0, + "step": 8449 + }, + { + "epoch": 0.9279595870854381, + "grad_norm": 2.212678909301758, + "learning_rate": 1e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.7274210453033447, + "num_tokens": 214147532.0, + "step": 8450 + }, + { + "epoch": 0.9280694047880519, + "grad_norm": 
2.262615442276001, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.6860721111297607, + "num_tokens": 214172083.0, + "step": 8451 + }, + { + "epoch": 0.9281792224906655, + "grad_norm": 2.080469846725464, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7036279439926147, + "num_tokens": 214200373.0, + "step": 8452 + }, + { + "epoch": 0.9282890401932792, + "grad_norm": 2.1419622898101807, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7083301544189453, + "num_tokens": 214225360.0, + "step": 8453 + }, + { + "epoch": 0.9283988578958928, + "grad_norm": 2.3045151233673096, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.716881275177002, + "num_tokens": 214248945.0, + "step": 8454 + }, + { + "epoch": 0.9285086755985065, + "grad_norm": 2.0684783458709717, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.7007750868797302, + "num_tokens": 214275649.0, + "step": 8455 + }, + { + "epoch": 0.9286184933011201, + "grad_norm": 1.7460967302322388, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.6912864446640015, + "num_tokens": 214313584.0, + "step": 8456 + }, + { + "epoch": 0.9287283110037338, + "grad_norm": 2.0867362022399902, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7107201814651489, + "num_tokens": 214341262.0, + "step": 8457 + }, + { + "epoch": 0.9288381287063474, + "grad_norm": 2.4590656757354736, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7160000801086426, + "num_tokens": 214362941.0, + "step": 8458 + }, + { + "epoch": 0.9289479464089612, + "grad_norm": 1.9545210599899292, + "learning_rate": 1e-06, + "loss": 1.0603, + "mean_token_accuracy": 0.6769180297851562, + "num_tokens": 214393612.0, + "step": 8459 + }, + { + "epoch": 0.9290577641115748, + "grad_norm": 2.244835615158081, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7196629047393799, + "num_tokens": 
214417794.0, + "step": 8460 + }, + { + "epoch": 0.9291675818141885, + "grad_norm": 2.0053176879882812, + "learning_rate": 1e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.7206742763519287, + "num_tokens": 214446156.0, + "step": 8461 + }, + { + "epoch": 0.9292773995168021, + "grad_norm": 2.0728862285614014, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7220611572265625, + "num_tokens": 214475163.0, + "step": 8462 + }, + { + "epoch": 0.9293872172194158, + "grad_norm": 2.3700153827667236, + "learning_rate": 1e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.7234822511672974, + "num_tokens": 214496528.0, + "step": 8463 + }, + { + "epoch": 0.9294970349220294, + "grad_norm": 2.5580921173095703, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7010592818260193, + "num_tokens": 214516280.0, + "step": 8464 + }, + { + "epoch": 0.929606852624643, + "grad_norm": 1.8555043935775757, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7285794615745544, + "num_tokens": 214550667.0, + "step": 8465 + }, + { + "epoch": 0.9297166703272568, + "grad_norm": 2.0300135612487793, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7097183465957642, + "num_tokens": 214580623.0, + "step": 8466 + }, + { + "epoch": 0.9298264880298704, + "grad_norm": 2.16405987739563, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7144861221313477, + "num_tokens": 214604959.0, + "step": 8467 + }, + { + "epoch": 0.9299363057324841, + "grad_norm": 2.294776678085327, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7014505863189697, + "num_tokens": 214627233.0, + "step": 8468 + }, + { + "epoch": 0.9300461234350977, + "grad_norm": 2.2276647090911865, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7212154865264893, + "num_tokens": 214650668.0, + "step": 8469 + }, + { + "epoch": 0.9301559411377114, + "grad_norm": 2.1993277072906494, + "learning_rate": 1e-06, + 
"loss": 0.9647, + "mean_token_accuracy": 0.7036958932876587, + "num_tokens": 214676854.0, + "step": 8470 + }, + { + "epoch": 0.930265758840325, + "grad_norm": 2.0190491676330566, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7164936661720276, + "num_tokens": 214704644.0, + "step": 8471 + }, + { + "epoch": 0.9303755765429387, + "grad_norm": 2.284503221511841, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7041338682174683, + "num_tokens": 214729124.0, + "step": 8472 + }, + { + "epoch": 0.9304853942455524, + "grad_norm": 2.1103203296661377, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.6952382922172546, + "num_tokens": 214756288.0, + "step": 8473 + }, + { + "epoch": 0.9305952119481661, + "grad_norm": 2.503542184829712, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7285602688789368, + "num_tokens": 214774783.0, + "step": 8474 + }, + { + "epoch": 0.9307050296507797, + "grad_norm": 2.1455068588256836, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7109479308128357, + "num_tokens": 214800060.0, + "step": 8475 + }, + { + "epoch": 0.9308148473533934, + "grad_norm": 2.5237743854522705, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7323848605155945, + "num_tokens": 214818765.0, + "step": 8476 + }, + { + "epoch": 0.930924665056007, + "grad_norm": 2.600192070007324, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7086930274963379, + "num_tokens": 214837514.0, + "step": 8477 + }, + { + "epoch": 0.9310344827586207, + "grad_norm": 2.6804416179656982, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7168059349060059, + "num_tokens": 214855369.0, + "step": 8478 + }, + { + "epoch": 0.9311443004612343, + "grad_norm": 2.3237874507904053, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.703109860420227, + "num_tokens": 214879967.0, + "step": 8479 + }, + { + "epoch": 
0.9312541181638481, + "grad_norm": 2.6115596294403076, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7265480756759644, + "num_tokens": 214899868.0, + "step": 8480 + }, + { + "epoch": 0.9313639358664617, + "grad_norm": 2.1979899406433105, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6947029829025269, + "num_tokens": 214926551.0, + "step": 8481 + }, + { + "epoch": 0.9314737535690754, + "grad_norm": 2.3238093852996826, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7034213542938232, + "num_tokens": 214954007.0, + "step": 8482 + }, + { + "epoch": 0.931583571271689, + "grad_norm": 2.4152185916900635, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.6984217166900635, + "num_tokens": 214975749.0, + "step": 8483 + }, + { + "epoch": 0.9316933889743026, + "grad_norm": 2.0061256885528564, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7115603089332581, + "num_tokens": 215003499.0, + "step": 8484 + }, + { + "epoch": 0.9318032066769163, + "grad_norm": 2.048555612564087, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7183061242103577, + "num_tokens": 215031305.0, + "step": 8485 + }, + { + "epoch": 0.9319130243795299, + "grad_norm": 2.1188771724700928, + "learning_rate": 1e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.6817041635513306, + "num_tokens": 215058692.0, + "step": 8486 + }, + { + "epoch": 0.9320228420821436, + "grad_norm": 2.1122663021087646, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.6995962858200073, + "num_tokens": 215085814.0, + "step": 8487 + }, + { + "epoch": 0.9321326597847573, + "grad_norm": 2.1484453678131104, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.6989195942878723, + "num_tokens": 215111454.0, + "step": 8488 + }, + { + "epoch": 0.932242477487371, + "grad_norm": 2.256016254425049, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 
0.7078596949577332, + "num_tokens": 215134933.0, + "step": 8489 + }, + { + "epoch": 0.9323522951899846, + "grad_norm": 2.4120852947235107, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.6963512897491455, + "num_tokens": 215156433.0, + "step": 8490 + }, + { + "epoch": 0.9324621128925983, + "grad_norm": 2.086970090866089, + "learning_rate": 1e-06, + "loss": 0.928, + "mean_token_accuracy": 0.710484504699707, + "num_tokens": 215180724.0, + "step": 8491 + }, + { + "epoch": 0.9325719305952119, + "grad_norm": 2.3583085536956787, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7091547250747681, + "num_tokens": 215202366.0, + "step": 8492 + }, + { + "epoch": 0.9326817482978256, + "grad_norm": 2.0709590911865234, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7131731510162354, + "num_tokens": 215229852.0, + "step": 8493 + }, + { + "epoch": 0.9327915660004392, + "grad_norm": 2.1252894401550293, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.6970971822738647, + "num_tokens": 215256923.0, + "step": 8494 + }, + { + "epoch": 0.932901383703053, + "grad_norm": 1.9910703897476196, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7305769324302673, + "num_tokens": 215284800.0, + "step": 8495 + }, + { + "epoch": 0.9330112014056666, + "grad_norm": 2.279782772064209, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7002356052398682, + "num_tokens": 215308470.0, + "step": 8496 + }, + { + "epoch": 0.9331210191082803, + "grad_norm": 2.169649600982666, + "learning_rate": 1e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7201744318008423, + "num_tokens": 215333527.0, + "step": 8497 + }, + { + "epoch": 0.9332308368108939, + "grad_norm": 2.3329620361328125, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7271239757537842, + "num_tokens": 215355881.0, + "step": 8498 + }, + { + "epoch": 0.9333406545135076, + "grad_norm": 
2.3053154945373535, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7238336801528931, + "num_tokens": 215378426.0, + "step": 8499 + }, + { + "epoch": 0.9334504722161212, + "grad_norm": 2.219845771789551, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7078332901000977, + "num_tokens": 215404128.0, + "step": 8500 + }, + { + "epoch": 0.9335602899187349, + "grad_norm": 1.931808590888977, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.6964951753616333, + "num_tokens": 215434530.0, + "step": 8501 + }, + { + "epoch": 0.9336701076213486, + "grad_norm": 2.2632827758789062, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.700813353061676, + "num_tokens": 215457688.0, + "step": 8502 + }, + { + "epoch": 0.9337799253239623, + "grad_norm": 2.0733115673065186, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7185992002487183, + "num_tokens": 215484221.0, + "step": 8503 + }, + { + "epoch": 0.9338897430265759, + "grad_norm": 2.198516845703125, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 0.7045930624008179, + "num_tokens": 215510652.0, + "step": 8504 + }, + { + "epoch": 0.9339995607291895, + "grad_norm": 2.0623934268951416, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7051166296005249, + "num_tokens": 215536986.0, + "step": 8505 + }, + { + "epoch": 0.9341093784318032, + "grad_norm": 2.3707098960876465, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7065379619598389, + "num_tokens": 215559427.0, + "step": 8506 + }, + { + "epoch": 0.9342191961344168, + "grad_norm": 2.1160073280334473, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.712378978729248, + "num_tokens": 215585762.0, + "step": 8507 + }, + { + "epoch": 0.9343290138370305, + "grad_norm": 2.556837320327759, + "learning_rate": 1e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7126380205154419, + "num_tokens": 
215605559.0, + "step": 8508 + }, + { + "epoch": 0.9344388315396442, + "grad_norm": 2.3616714477539062, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7248243093490601, + "num_tokens": 215628015.0, + "step": 8509 + }, + { + "epoch": 0.9345486492422579, + "grad_norm": 2.4578447341918945, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7156305313110352, + "num_tokens": 215647838.0, + "step": 8510 + }, + { + "epoch": 0.9346584669448715, + "grad_norm": 2.3330252170562744, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.72031569480896, + "num_tokens": 215669346.0, + "step": 8511 + }, + { + "epoch": 0.9347682846474852, + "grad_norm": 2.2457876205444336, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7090344429016113, + "num_tokens": 215693005.0, + "step": 8512 + }, + { + "epoch": 0.9348781023500988, + "grad_norm": 2.1624338626861572, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.6977096796035767, + "num_tokens": 215717559.0, + "step": 8513 + }, + { + "epoch": 0.9349879200527125, + "grad_norm": 2.0867815017700195, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7040174007415771, + "num_tokens": 215746147.0, + "step": 8514 + }, + { + "epoch": 0.9350977377553261, + "grad_norm": 2.2544422149658203, + "learning_rate": 1e-06, + "loss": 0.871, + "mean_token_accuracy": 0.7259978652000427, + "num_tokens": 215766642.0, + "step": 8515 + }, + { + "epoch": 0.9352075554579398, + "grad_norm": 1.9310952425003052, + "learning_rate": 1e-06, + "loss": 1.0362, + "mean_token_accuracy": 0.689087986946106, + "num_tokens": 215798741.0, + "step": 8516 + }, + { + "epoch": 0.9353173731605535, + "grad_norm": 2.1294212341308594, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7033603191375732, + "num_tokens": 215824506.0, + "step": 8517 + }, + { + "epoch": 0.9354271908631672, + "grad_norm": 2.2817864418029785, + "learning_rate": 1e-06, + 
"loss": 0.9274, + "mean_token_accuracy": 0.7166067361831665, + "num_tokens": 215848513.0, + "step": 8518 + }, + { + "epoch": 0.9355370085657808, + "grad_norm": 2.3175158500671387, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7151085138320923, + "num_tokens": 215872905.0, + "step": 8519 + }, + { + "epoch": 0.9356468262683945, + "grad_norm": 2.199150562286377, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7063359022140503, + "num_tokens": 215897761.0, + "step": 8520 + }, + { + "epoch": 0.9357566439710081, + "grad_norm": 2.3766214847564697, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.717157781124115, + "num_tokens": 215918585.0, + "step": 8521 + }, + { + "epoch": 0.9358664616736218, + "grad_norm": 2.2436513900756836, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7133026719093323, + "num_tokens": 215943179.0, + "step": 8522 + }, + { + "epoch": 0.9359762793762354, + "grad_norm": 2.0621204376220703, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.704333484172821, + "num_tokens": 215971428.0, + "step": 8523 + }, + { + "epoch": 0.9360860970788492, + "grad_norm": 2.570925712585449, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.7233844995498657, + "num_tokens": 215988558.0, + "step": 8524 + }, + { + "epoch": 0.9361959147814628, + "grad_norm": 1.9182294607162476, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7209317684173584, + "num_tokens": 216018919.0, + "step": 8525 + }, + { + "epoch": 0.9363057324840764, + "grad_norm": 2.0944857597351074, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.708175539970398, + "num_tokens": 216045282.0, + "step": 8526 + }, + { + "epoch": 0.9364155501866901, + "grad_norm": 2.1775429248809814, + "learning_rate": 1e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7260898351669312, + "num_tokens": 216067653.0, + "step": 8527 + }, + { + "epoch": 
0.9365253678893037, + "grad_norm": 2.2295422554016113, + "learning_rate": 1e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.7367783784866333, + "num_tokens": 216090239.0, + "step": 8528 + }, + { + "epoch": 0.9366351855919174, + "grad_norm": 2.2749574184417725, + "learning_rate": 1e-06, + "loss": 0.8047, + "mean_token_accuracy": 0.7444226741790771, + "num_tokens": 216111288.0, + "step": 8529 + }, + { + "epoch": 0.936745003294531, + "grad_norm": 2.4334731101989746, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7207191586494446, + "num_tokens": 216132146.0, + "step": 8530 + }, + { + "epoch": 0.9368548209971448, + "grad_norm": 2.1315455436706543, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.6899465322494507, + "num_tokens": 216160733.0, + "step": 8531 + }, + { + "epoch": 0.9369646386997584, + "grad_norm": 2.095121383666992, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7011308670043945, + "num_tokens": 216188782.0, + "step": 8532 + }, + { + "epoch": 0.9370744564023721, + "grad_norm": 2.5914905071258545, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7102608680725098, + "num_tokens": 216207447.0, + "step": 8533 + }, + { + "epoch": 0.9371842741049857, + "grad_norm": 2.306879758834839, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.709089994430542, + "num_tokens": 216230346.0, + "step": 8534 + }, + { + "epoch": 0.9372940918075994, + "grad_norm": 2.3438968658447266, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.706336259841919, + "num_tokens": 216253543.0, + "step": 8535 + }, + { + "epoch": 0.937403909510213, + "grad_norm": 1.9808505773544312, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7207591533660889, + "num_tokens": 216281752.0, + "step": 8536 + }, + { + "epoch": 0.9375137272128267, + "grad_norm": 1.994850516319275, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 
0.6948193311691284, + "num_tokens": 216310504.0, + "step": 8537 + }, + { + "epoch": 0.9376235449154404, + "grad_norm": 2.0542244911193848, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.703545331954956, + "num_tokens": 216339140.0, + "step": 8538 + }, + { + "epoch": 0.9377333626180541, + "grad_norm": 2.1301400661468506, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7001961469650269, + "num_tokens": 216366051.0, + "step": 8539 + }, + { + "epoch": 0.9378431803206677, + "grad_norm": 2.078813076019287, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7167161107063293, + "num_tokens": 216392210.0, + "step": 8540 + }, + { + "epoch": 0.9379529980232814, + "grad_norm": 2.225604295730591, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7133097648620605, + "num_tokens": 216416140.0, + "step": 8541 + }, + { + "epoch": 0.938062815725895, + "grad_norm": 2.397429943084717, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7155817151069641, + "num_tokens": 216439079.0, + "step": 8542 + }, + { + "epoch": 0.9381726334285087, + "grad_norm": 2.3365585803985596, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.711562991142273, + "num_tokens": 216463070.0, + "step": 8543 + }, + { + "epoch": 0.9382824511311223, + "grad_norm": 2.037884473800659, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.7007534503936768, + "num_tokens": 216492345.0, + "step": 8544 + }, + { + "epoch": 0.9383922688337359, + "grad_norm": 2.1553330421447754, + "learning_rate": 1e-06, + "loss": 0.928, + "mean_token_accuracy": 0.7106626629829407, + "num_tokens": 216518037.0, + "step": 8545 + }, + { + "epoch": 0.9385020865363497, + "grad_norm": 2.1597518920898438, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7169021964073181, + "num_tokens": 216543485.0, + "step": 8546 + }, + { + "epoch": 0.9386119042389633, + "grad_norm": 1.993262767791748, 
+ "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.6961435079574585, + "num_tokens": 216572592.0, + "step": 8547 + }, + { + "epoch": 0.938721721941577, + "grad_norm": 2.279572010040283, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7035963535308838, + "num_tokens": 216597499.0, + "step": 8548 + }, + { + "epoch": 0.9388315396441906, + "grad_norm": 2.3120288848876953, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7127470970153809, + "num_tokens": 216620456.0, + "step": 8549 + }, + { + "epoch": 0.9389413573468043, + "grad_norm": 2.1886098384857178, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.6912158131599426, + "num_tokens": 216646559.0, + "step": 8550 + }, + { + "epoch": 0.9390511750494179, + "grad_norm": 2.078629493713379, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7351636290550232, + "num_tokens": 216672612.0, + "step": 8551 + }, + { + "epoch": 0.9391609927520316, + "grad_norm": 2.417973041534424, + "learning_rate": 1e-06, + "loss": 0.8376, + "mean_token_accuracy": 0.7354027628898621, + "num_tokens": 216693361.0, + "step": 8552 + }, + { + "epoch": 0.9392708104546453, + "grad_norm": 2.6307592391967773, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.700661301612854, + "num_tokens": 216713818.0, + "step": 8553 + }, + { + "epoch": 0.939380628157259, + "grad_norm": 2.8057973384857178, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.6990131139755249, + "num_tokens": 216731811.0, + "step": 8554 + }, + { + "epoch": 0.9394904458598726, + "grad_norm": 2.191955327987671, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7082308530807495, + "num_tokens": 216757595.0, + "step": 8555 + }, + { + "epoch": 0.9396002635624863, + "grad_norm": 2.1762819290161133, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6931255459785461, + "num_tokens": 216783595.0, + "step": 8556 + 
}, + { + "epoch": 0.9397100812650999, + "grad_norm": 2.0898923873901367, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7094939351081848, + "num_tokens": 216812184.0, + "step": 8557 + }, + { + "epoch": 0.9398198989677136, + "grad_norm": 2.1571125984191895, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.6910567283630371, + "num_tokens": 216837577.0, + "step": 8558 + }, + { + "epoch": 0.9399297166703272, + "grad_norm": 2.5375747680664062, + "learning_rate": 1e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7284513711929321, + "num_tokens": 216856774.0, + "step": 8559 + }, + { + "epoch": 0.940039534372941, + "grad_norm": 2.2463793754577637, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.7026649713516235, + "num_tokens": 216880760.0, + "step": 8560 + }, + { + "epoch": 0.9401493520755546, + "grad_norm": 2.1500930786132812, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7170683145523071, + "num_tokens": 216906393.0, + "step": 8561 + }, + { + "epoch": 0.9402591697781683, + "grad_norm": 2.039607048034668, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.6986467838287354, + "num_tokens": 216936570.0, + "step": 8562 + }, + { + "epoch": 0.9403689874807819, + "grad_norm": 2.3729584217071533, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7220678329467773, + "num_tokens": 216957775.0, + "step": 8563 + }, + { + "epoch": 0.9404788051833955, + "grad_norm": 2.0484633445739746, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7098134756088257, + "num_tokens": 216985595.0, + "step": 8564 + }, + { + "epoch": 0.9405886228860092, + "grad_norm": 2.0245025157928467, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7104275226593018, + "num_tokens": 217015632.0, + "step": 8565 + }, + { + "epoch": 0.9406984405886228, + "grad_norm": 2.315854549407959, + "learning_rate": 1e-06, + "loss": 0.9087, + 
"mean_token_accuracy": 0.723617434501648, + "num_tokens": 217037582.0, + "step": 8566 + }, + { + "epoch": 0.9408082582912366, + "grad_norm": 2.227329969406128, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7237249612808228, + "num_tokens": 217060820.0, + "step": 8567 + }, + { + "epoch": 0.9409180759938502, + "grad_norm": 1.9931107759475708, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.708137571811676, + "num_tokens": 217090706.0, + "step": 8568 + }, + { + "epoch": 0.9410278936964639, + "grad_norm": 2.1002249717712402, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7058916091918945, + "num_tokens": 217119866.0, + "step": 8569 + }, + { + "epoch": 0.9411377113990775, + "grad_norm": 2.380359411239624, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7222351431846619, + "num_tokens": 217141019.0, + "step": 8570 + }, + { + "epoch": 0.9412475291016912, + "grad_norm": 2.1761648654937744, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7249487638473511, + "num_tokens": 217167803.0, + "step": 8571 + }, + { + "epoch": 0.9413573468043048, + "grad_norm": 2.246943235397339, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.6950874328613281, + "num_tokens": 217193205.0, + "step": 8572 + }, + { + "epoch": 0.9414671645069185, + "grad_norm": 2.174057960510254, + "learning_rate": 1e-06, + "loss": 0.8293, + "mean_token_accuracy": 0.7401505708694458, + "num_tokens": 217215679.0, + "step": 8573 + }, + { + "epoch": 0.9415769822095322, + "grad_norm": 2.0776302814483643, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7142176628112793, + "num_tokens": 217245230.0, + "step": 8574 + }, + { + "epoch": 0.9416867999121459, + "grad_norm": 2.076575756072998, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.6979820728302002, + "num_tokens": 217273128.0, + "step": 8575 + }, + { + "epoch": 0.9417966176147595, + 
"grad_norm": 2.059251308441162, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7074565887451172, + "num_tokens": 217302153.0, + "step": 8576 + }, + { + "epoch": 0.9419064353173732, + "grad_norm": 2.0781326293945312, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7050462961196899, + "num_tokens": 217329094.0, + "step": 8577 + }, + { + "epoch": 0.9420162530199868, + "grad_norm": 2.425055980682373, + "learning_rate": 1e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7208386659622192, + "num_tokens": 217349212.0, + "step": 8578 + }, + { + "epoch": 0.9421260707226005, + "grad_norm": 2.1820788383483887, + "learning_rate": 1e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7240817546844482, + "num_tokens": 217375842.0, + "step": 8579 + }, + { + "epoch": 0.9422358884252141, + "grad_norm": 2.383866786956787, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7130138278007507, + "num_tokens": 217398249.0, + "step": 8580 + }, + { + "epoch": 0.9423457061278278, + "grad_norm": 2.131978750228882, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7120022773742676, + "num_tokens": 217423615.0, + "step": 8581 + }, + { + "epoch": 0.9424555238304415, + "grad_norm": 2.1543023586273193, + "learning_rate": 1e-06, + "loss": 1.0546, + "mean_token_accuracy": 0.6846561431884766, + "num_tokens": 217450144.0, + "step": 8582 + }, + { + "epoch": 0.9425653415330552, + "grad_norm": 2.650001049041748, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7156929969787598, + "num_tokens": 217468121.0, + "step": 8583 + }, + { + "epoch": 0.9426751592356688, + "grad_norm": 2.044621467590332, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.6918411254882812, + "num_tokens": 217496887.0, + "step": 8584 + }, + { + "epoch": 0.9427849769382824, + "grad_norm": 2.3347439765930176, + "learning_rate": 1e-06, + "loss": 0.7913, + "mean_token_accuracy": 0.750401496887207, + 
"num_tokens": 217517586.0, + "step": 8585 + }, + { + "epoch": 0.9428947946408961, + "grad_norm": 1.9468404054641724, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.7006839513778687, + "num_tokens": 217550097.0, + "step": 8586 + }, + { + "epoch": 0.9430046123435097, + "grad_norm": 2.215399742126465, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7070120573043823, + "num_tokens": 217573128.0, + "step": 8587 + }, + { + "epoch": 0.9431144300461234, + "grad_norm": 2.3798413276672363, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7206777334213257, + "num_tokens": 217595863.0, + "step": 8588 + }, + { + "epoch": 0.9432242477487371, + "grad_norm": 2.50154185295105, + "learning_rate": 1e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.7386218905448914, + "num_tokens": 217614863.0, + "step": 8589 + }, + { + "epoch": 0.9433340654513508, + "grad_norm": 2.4368577003479004, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7193313241004944, + "num_tokens": 217636873.0, + "step": 8590 + }, + { + "epoch": 0.9434438831539644, + "grad_norm": 2.189347505569458, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7088987231254578, + "num_tokens": 217661345.0, + "step": 8591 + }, + { + "epoch": 0.9435537008565781, + "grad_norm": 2.2318663597106934, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7078834772109985, + "num_tokens": 217685195.0, + "step": 8592 + }, + { + "epoch": 0.9436635185591917, + "grad_norm": 2.209597110748291, + "learning_rate": 1e-06, + "loss": 0.856, + "mean_token_accuracy": 0.7307184934616089, + "num_tokens": 217708657.0, + "step": 8593 + }, + { + "epoch": 0.9437733362618054, + "grad_norm": 1.8788906335830688, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7035639882087708, + "num_tokens": 217740836.0, + "step": 8594 + }, + { + "epoch": 0.943883153964419, + "grad_norm": 2.0421135425567627, + 
"learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.6931719779968262, + "num_tokens": 217768377.0, + "step": 8595 + }, + { + "epoch": 0.9439929716670328, + "grad_norm": 2.2063279151916504, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7203284502029419, + "num_tokens": 217793346.0, + "step": 8596 + }, + { + "epoch": 0.9441027893696464, + "grad_norm": 1.7999472618103027, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7236909866333008, + "num_tokens": 217824669.0, + "step": 8597 + }, + { + "epoch": 0.9442126070722601, + "grad_norm": 2.2319130897521973, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7303824424743652, + "num_tokens": 217850510.0, + "step": 8598 + }, + { + "epoch": 0.9443224247748737, + "grad_norm": 2.2695226669311523, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7014510035514832, + "num_tokens": 217873556.0, + "step": 8599 + }, + { + "epoch": 0.9444322424774874, + "grad_norm": 2.2160165309906006, + "learning_rate": 1e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.6948901414871216, + "num_tokens": 217900498.0, + "step": 8600 + }, + { + "epoch": 0.944542060180101, + "grad_norm": 2.2828927040100098, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.7019733190536499, + "num_tokens": 217923380.0, + "step": 8601 + }, + { + "epoch": 0.9446518778827147, + "grad_norm": 2.0783402919769287, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.708836555480957, + "num_tokens": 217951370.0, + "step": 8602 + }, + { + "epoch": 0.9447616955853284, + "grad_norm": 2.257180690765381, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7174293398857117, + "num_tokens": 217975458.0, + "step": 8603 + }, + { + "epoch": 0.944871513287942, + "grad_norm": 2.176988124847412, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7229608297348022, + "num_tokens": 217999448.0, + "step": 8604 + 
}, + { + "epoch": 0.9449813309905557, + "grad_norm": 2.0742411613464355, + "learning_rate": 1e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.6990053653717041, + "num_tokens": 218026051.0, + "step": 8605 + }, + { + "epoch": 0.9450911486931693, + "grad_norm": 2.1369428634643555, + "learning_rate": 1e-06, + "loss": 1.0269, + "mean_token_accuracy": 0.6926634311676025, + "num_tokens": 218054274.0, + "step": 8606 + }, + { + "epoch": 0.945200966395783, + "grad_norm": 2.19372296333313, + "learning_rate": 1e-06, + "loss": 1.0478, + "mean_token_accuracy": 0.6839392185211182, + "num_tokens": 218082153.0, + "step": 8607 + }, + { + "epoch": 0.9453107840983966, + "grad_norm": 1.8147120475769043, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7012539505958557, + "num_tokens": 218115480.0, + "step": 8608 + }, + { + "epoch": 0.9454206018010103, + "grad_norm": 2.1396753787994385, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.6849380731582642, + "num_tokens": 218143329.0, + "step": 8609 + }, + { + "epoch": 0.9455304195036239, + "grad_norm": 2.145493984222412, + "learning_rate": 1e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6836754083633423, + "num_tokens": 218169549.0, + "step": 8610 + }, + { + "epoch": 0.9456402372062377, + "grad_norm": 1.8699613809585571, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7111645936965942, + "num_tokens": 218200105.0, + "step": 8611 + }, + { + "epoch": 0.9457500549088513, + "grad_norm": 2.115422487258911, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7286150455474854, + "num_tokens": 218223749.0, + "step": 8612 + }, + { + "epoch": 0.945859872611465, + "grad_norm": 2.2364144325256348, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7030878067016602, + "num_tokens": 218247267.0, + "step": 8613 + }, + { + "epoch": 0.9459696903140786, + "grad_norm": 2.8163487911224365, + "learning_rate": 1e-06, + "loss": 0.9115, + 
"mean_token_accuracy": 0.7146226167678833, + "num_tokens": 218263742.0, + "step": 8614 + }, + { + "epoch": 0.9460795080166923, + "grad_norm": 2.1571648120880127, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.6966111660003662, + "num_tokens": 218288329.0, + "step": 8615 + }, + { + "epoch": 0.9461893257193059, + "grad_norm": 2.2216076850891113, + "learning_rate": 1e-06, + "loss": 1.0205, + "mean_token_accuracy": 0.6941189169883728, + "num_tokens": 218313291.0, + "step": 8616 + }, + { + "epoch": 0.9462991434219196, + "grad_norm": 2.3771705627441406, + "learning_rate": 1e-06, + "loss": 0.891, + "mean_token_accuracy": 0.720149040222168, + "num_tokens": 218333352.0, + "step": 8617 + }, + { + "epoch": 0.9464089611245333, + "grad_norm": 2.059826135635376, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.6985909938812256, + "num_tokens": 218360294.0, + "step": 8618 + }, + { + "epoch": 0.946518778827147, + "grad_norm": 2.202371835708618, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7220090627670288, + "num_tokens": 218384390.0, + "step": 8619 + }, + { + "epoch": 0.9466285965297606, + "grad_norm": 2.043832540512085, + "learning_rate": 1e-06, + "loss": 1.0321, + "mean_token_accuracy": 0.6816172003746033, + "num_tokens": 218412843.0, + "step": 8620 + }, + { + "epoch": 0.9467384142323743, + "grad_norm": 1.8242698907852173, + "learning_rate": 1e-06, + "loss": 1.0591, + "mean_token_accuracy": 0.6847515106201172, + "num_tokens": 218450333.0, + "step": 8621 + }, + { + "epoch": 0.9468482319349879, + "grad_norm": 2.5906434059143066, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7020195722579956, + "num_tokens": 218471349.0, + "step": 8622 + }, + { + "epoch": 0.9469580496376016, + "grad_norm": 2.1587071418762207, + "learning_rate": 1e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.6875725984573364, + "num_tokens": 218499383.0, + "step": 8623 + }, + { + "epoch": 0.9470678673402152, + 
"grad_norm": 2.2201926708221436, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7211278676986694, + "num_tokens": 218524541.0, + "step": 8624 + }, + { + "epoch": 0.947177685042829, + "grad_norm": 2.168802261352539, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.717968761920929, + "num_tokens": 218550250.0, + "step": 8625 + }, + { + "epoch": 0.9472875027454426, + "grad_norm": 2.1200342178344727, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7044819593429565, + "num_tokens": 218578213.0, + "step": 8626 + }, + { + "epoch": 0.9473973204480562, + "grad_norm": 2.3166513442993164, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7295907139778137, + "num_tokens": 218600057.0, + "step": 8627 + }, + { + "epoch": 0.9475071381506699, + "grad_norm": 2.2584753036499023, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7198959589004517, + "num_tokens": 218624138.0, + "step": 8628 + }, + { + "epoch": 0.9476169558532835, + "grad_norm": 2.1838903427124023, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7145119905471802, + "num_tokens": 218650463.0, + "step": 8629 + }, + { + "epoch": 0.9477267735558972, + "grad_norm": 2.1787424087524414, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7074492573738098, + "num_tokens": 218676724.0, + "step": 8630 + }, + { + "epoch": 0.9478365912585108, + "grad_norm": 2.15960955619812, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.6993882656097412, + "num_tokens": 218702891.0, + "step": 8631 + }, + { + "epoch": 0.9479464089611246, + "grad_norm": 2.58890962600708, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7144193053245544, + "num_tokens": 218722366.0, + "step": 8632 + }, + { + "epoch": 0.9480562266637382, + "grad_norm": 2.5528600215911865, + "learning_rate": 1e-06, + "loss": 0.8653, + "mean_token_accuracy": 0.7284688949584961, + 
"num_tokens": 218742422.0, + "step": 8633 + }, + { + "epoch": 0.9481660443663519, + "grad_norm": 2.073385000228882, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7171133756637573, + "num_tokens": 218768237.0, + "step": 8634 + }, + { + "epoch": 0.9482758620689655, + "grad_norm": 2.2258243560791016, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7117064595222473, + "num_tokens": 218792302.0, + "step": 8635 + }, + { + "epoch": 0.9483856797715792, + "grad_norm": 2.3599181175231934, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7181810140609741, + "num_tokens": 218814436.0, + "step": 8636 + }, + { + "epoch": 0.9484954974741928, + "grad_norm": 2.330296516418457, + "learning_rate": 1e-06, + "loss": 0.8571, + "mean_token_accuracy": 0.735895037651062, + "num_tokens": 218835874.0, + "step": 8637 + }, + { + "epoch": 0.9486053151768065, + "grad_norm": 2.180070161819458, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7049778699874878, + "num_tokens": 218862000.0, + "step": 8638 + }, + { + "epoch": 0.9487151328794201, + "grad_norm": 2.1332738399505615, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.727527379989624, + "num_tokens": 218885785.0, + "step": 8639 + }, + { + "epoch": 0.9488249505820339, + "grad_norm": 2.633101224899292, + "learning_rate": 1e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7419828176498413, + "num_tokens": 218904126.0, + "step": 8640 + }, + { + "epoch": 0.9489347682846475, + "grad_norm": 2.5389575958251953, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.7000716328620911, + "num_tokens": 218925231.0, + "step": 8641 + }, + { + "epoch": 0.9490445859872612, + "grad_norm": 2.3487675189971924, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7125617265701294, + "num_tokens": 218946796.0, + "step": 8642 + }, + { + "epoch": 0.9491544036898748, + "grad_norm": 2.116300344467163, + 
"learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7216840386390686, + "num_tokens": 218971555.0, + "step": 8643 + }, + { + "epoch": 0.9492642213924884, + "grad_norm": 2.0899977684020996, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7065069675445557, + "num_tokens": 219001255.0, + "step": 8644 + }, + { + "epoch": 0.9493740390951021, + "grad_norm": 2.12347412109375, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.6992666721343994, + "num_tokens": 219028237.0, + "step": 8645 + }, + { + "epoch": 0.9494838567977157, + "grad_norm": 2.209686756134033, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.720843493938446, + "num_tokens": 219051489.0, + "step": 8646 + }, + { + "epoch": 0.9495936745003295, + "grad_norm": 2.2341854572296143, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.707166850566864, + "num_tokens": 219076876.0, + "step": 8647 + }, + { + "epoch": 0.9497034922029431, + "grad_norm": 2.392915725708008, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7058899402618408, + "num_tokens": 219098623.0, + "step": 8648 + }, + { + "epoch": 0.9498133099055568, + "grad_norm": 2.013077974319458, + "learning_rate": 1e-06, + "loss": 1.0269, + "mean_token_accuracy": 0.6877212524414062, + "num_tokens": 219126636.0, + "step": 8649 + }, + { + "epoch": 0.9499231276081704, + "grad_norm": 1.9888633489608765, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.7024999856948853, + "num_tokens": 219156214.0, + "step": 8650 + }, + { + "epoch": 0.9500329453107841, + "grad_norm": 2.316746711730957, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.6969602108001709, + "num_tokens": 219180962.0, + "step": 8651 + }, + { + "epoch": 0.9501427630133977, + "grad_norm": 2.311295986175537, + "learning_rate": 1e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.7158635258674622, + "num_tokens": 219201338.0, + "step": 8652 + }, + 
{ + "epoch": 0.9502525807160114, + "grad_norm": 2.2415688037872314, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.712863564491272, + "num_tokens": 219224320.0, + "step": 8653 + }, + { + "epoch": 0.9503623984186251, + "grad_norm": 1.849382996559143, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.6995133757591248, + "num_tokens": 219256881.0, + "step": 8654 + }, + { + "epoch": 0.9504722161212388, + "grad_norm": 2.1316897869110107, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7080024480819702, + "num_tokens": 219283589.0, + "step": 8655 + }, + { + "epoch": 0.9505820338238524, + "grad_norm": 2.2053892612457275, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.6982218027114868, + "num_tokens": 219308950.0, + "step": 8656 + }, + { + "epoch": 0.9506918515264661, + "grad_norm": 1.9388456344604492, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.6984675526618958, + "num_tokens": 219340070.0, + "step": 8657 + }, + { + "epoch": 0.9508016692290797, + "grad_norm": 2.1235549449920654, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.6993290185928345, + "num_tokens": 219365950.0, + "step": 8658 + }, + { + "epoch": 0.9509114869316934, + "grad_norm": 2.2999658584594727, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7294905185699463, + "num_tokens": 219388014.0, + "step": 8659 + }, + { + "epoch": 0.951021304634307, + "grad_norm": 2.3475444316864014, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7249470949172974, + "num_tokens": 219408798.0, + "step": 8660 + }, + { + "epoch": 0.9511311223369208, + "grad_norm": 2.0954508781433105, + "learning_rate": 1e-06, + "loss": 1.024, + "mean_token_accuracy": 0.6957831382751465, + "num_tokens": 219434624.0, + "step": 8661 + }, + { + "epoch": 0.9512409400395344, + "grad_norm": 2.193681478500366, + "learning_rate": 1e-06, + "loss": 0.9519, + 
"mean_token_accuracy": 0.7108399271965027, + "num_tokens": 219457165.0, + "step": 8662 + }, + { + "epoch": 0.9513507577421481, + "grad_norm": 2.1832821369171143, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7058779001235962, + "num_tokens": 219482056.0, + "step": 8663 + }, + { + "epoch": 0.9514605754447617, + "grad_norm": 2.536804437637329, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7304261922836304, + "num_tokens": 219500109.0, + "step": 8664 + }, + { + "epoch": 0.9515703931473753, + "grad_norm": 2.1780948638916016, + "learning_rate": 1e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.6875532269477844, + "num_tokens": 219526304.0, + "step": 8665 + }, + { + "epoch": 0.951680210849989, + "grad_norm": 2.1103508472442627, + "learning_rate": 1e-06, + "loss": 1.0757, + "mean_token_accuracy": 0.6759660243988037, + "num_tokens": 219553650.0, + "step": 8666 + }, + { + "epoch": 0.9517900285526026, + "grad_norm": 2.365363836288452, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7057931423187256, + "num_tokens": 219578176.0, + "step": 8667 + }, + { + "epoch": 0.9518998462552163, + "grad_norm": 2.19675350189209, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7104979753494263, + "num_tokens": 219603209.0, + "step": 8668 + }, + { + "epoch": 0.95200966395783, + "grad_norm": 2.2974960803985596, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7117268443107605, + "num_tokens": 219626364.0, + "step": 8669 + }, + { + "epoch": 0.9521194816604437, + "grad_norm": 2.099090576171875, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7110208868980408, + "num_tokens": 219654773.0, + "step": 8670 + }, + { + "epoch": 0.9522292993630573, + "grad_norm": 2.026038885116577, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7141197919845581, + "num_tokens": 219683783.0, + "step": 8671 + }, + { + "epoch": 0.952339117065671, + 
"grad_norm": 2.413892984390259, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.708012044429779, + "num_tokens": 219709378.0, + "step": 8672 + }, + { + "epoch": 0.9524489347682846, + "grad_norm": 2.1672394275665283, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6990489959716797, + "num_tokens": 219736819.0, + "step": 8673 + }, + { + "epoch": 0.9525587524708983, + "grad_norm": 2.2438621520996094, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.712921142578125, + "num_tokens": 219758787.0, + "step": 8674 + }, + { + "epoch": 0.9526685701735119, + "grad_norm": 2.059648036956787, + "learning_rate": 1e-06, + "loss": 0.8431, + "mean_token_accuracy": 0.7315671443939209, + "num_tokens": 219782987.0, + "step": 8675 + }, + { + "epoch": 0.9527783878761257, + "grad_norm": 1.911432147026062, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.6968383193016052, + "num_tokens": 219815636.0, + "step": 8676 + }, + { + "epoch": 0.9528882055787393, + "grad_norm": 2.425053358078003, + "learning_rate": 1e-06, + "loss": 0.8717, + "mean_token_accuracy": 0.7262464165687561, + "num_tokens": 219836865.0, + "step": 8677 + }, + { + "epoch": 0.952998023281353, + "grad_norm": 2.3952090740203857, + "learning_rate": 1e-06, + "loss": 0.8617, + "mean_token_accuracy": 0.72984379529953, + "num_tokens": 219857932.0, + "step": 8678 + }, + { + "epoch": 0.9531078409839666, + "grad_norm": 1.8379383087158203, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6946258544921875, + "num_tokens": 219892050.0, + "step": 8679 + }, + { + "epoch": 0.9532176586865803, + "grad_norm": 2.164351224899292, + "learning_rate": 1e-06, + "loss": 1.084, + "mean_token_accuracy": 0.669581949710846, + "num_tokens": 219921116.0, + "step": 8680 + }, + { + "epoch": 0.9533274763891939, + "grad_norm": 1.9070484638214111, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7060151696205139, + "num_tokens": 
219952808.0, + "step": 8681 + }, + { + "epoch": 0.9534372940918076, + "grad_norm": 2.2473151683807373, + "learning_rate": 1e-06, + "loss": 0.8345, + "mean_token_accuracy": 0.7400098443031311, + "num_tokens": 219978107.0, + "step": 8682 + }, + { + "epoch": 0.9535471117944213, + "grad_norm": 2.042910099029541, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.6903173923492432, + "num_tokens": 220006080.0, + "step": 8683 + }, + { + "epoch": 0.953656929497035, + "grad_norm": 2.1607441902160645, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7020523548126221, + "num_tokens": 220032692.0, + "step": 8684 + }, + { + "epoch": 0.9537667471996486, + "grad_norm": 2.067918539047241, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7047780752182007, + "num_tokens": 220060076.0, + "step": 8685 + }, + { + "epoch": 0.9538765649022622, + "grad_norm": 2.0890419483184814, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.6950323581695557, + "num_tokens": 220088599.0, + "step": 8686 + }, + { + "epoch": 0.9539863826048759, + "grad_norm": 2.148660898208618, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7121402025222778, + "num_tokens": 220114859.0, + "step": 8687 + }, + { + "epoch": 0.9540962003074895, + "grad_norm": 2.1790387630462646, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.700753927230835, + "num_tokens": 220140099.0, + "step": 8688 + }, + { + "epoch": 0.9542060180101032, + "grad_norm": 2.0427517890930176, + "learning_rate": 1e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.6944611072540283, + "num_tokens": 220168102.0, + "step": 8689 + }, + { + "epoch": 0.9543158357127169, + "grad_norm": 2.5991313457489014, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7101826667785645, + "num_tokens": 220187615.0, + "step": 8690 + }, + { + "epoch": 0.9544256534153306, + "grad_norm": 2.237654447555542, + "learning_rate": 1e-06, + 
"loss": 1.0211, + "mean_token_accuracy": 0.6971865892410278, + "num_tokens": 220211344.0, + "step": 8691 + }, + { + "epoch": 0.9545354711179442, + "grad_norm": 2.6795601844787598, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7211130261421204, + "num_tokens": 220229323.0, + "step": 8692 + }, + { + "epoch": 0.9546452888205579, + "grad_norm": 2.5112524032592773, + "learning_rate": 1e-06, + "loss": 0.8374, + "mean_token_accuracy": 0.7375264167785645, + "num_tokens": 220247857.0, + "step": 8693 + }, + { + "epoch": 0.9547551065231715, + "grad_norm": 2.540545701980591, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7111455202102661, + "num_tokens": 220268603.0, + "step": 8694 + }, + { + "epoch": 0.9548649242257852, + "grad_norm": 2.2593252658843994, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7112947702407837, + "num_tokens": 220293316.0, + "step": 8695 + }, + { + "epoch": 0.9549747419283988, + "grad_norm": 2.3202569484710693, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7195656299591064, + "num_tokens": 220315125.0, + "step": 8696 + }, + { + "epoch": 0.9550845596310125, + "grad_norm": 2.6012372970581055, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7176768183708191, + "num_tokens": 220334904.0, + "step": 8697 + }, + { + "epoch": 0.9551943773336262, + "grad_norm": 2.3721485137939453, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7267307043075562, + "num_tokens": 220357334.0, + "step": 8698 + }, + { + "epoch": 0.9553041950362399, + "grad_norm": 2.1505725383758545, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.6903235912322998, + "num_tokens": 220384702.0, + "step": 8699 + }, + { + "epoch": 0.9554140127388535, + "grad_norm": 2.3134028911590576, + "learning_rate": 1e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7328455448150635, + "num_tokens": 220408147.0, + "step": 8700 + }, + { + "epoch": 
0.9555238304414672, + "grad_norm": 2.2476937770843506, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7127224206924438, + "num_tokens": 220429936.0, + "step": 8701 + }, + { + "epoch": 0.9556336481440808, + "grad_norm": 2.361342668533325, + "learning_rate": 1e-06, + "loss": 1.0591, + "mean_token_accuracy": 0.6959215402603149, + "num_tokens": 220453144.0, + "step": 8702 + }, + { + "epoch": 0.9557434658466945, + "grad_norm": 2.0841572284698486, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.6849468946456909, + "num_tokens": 220480323.0, + "step": 8703 + }, + { + "epoch": 0.9558532835493081, + "grad_norm": 2.159625291824341, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.6940902471542358, + "num_tokens": 220508623.0, + "step": 8704 + }, + { + "epoch": 0.9559631012519219, + "grad_norm": 2.0962343215942383, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7190954685211182, + "num_tokens": 220536271.0, + "step": 8705 + }, + { + "epoch": 0.9560729189545355, + "grad_norm": 2.0599539279937744, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7169034481048584, + "num_tokens": 220562822.0, + "step": 8706 + }, + { + "epoch": 0.9561827366571491, + "grad_norm": 1.9776248931884766, + "learning_rate": 1e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7230788469314575, + "num_tokens": 220591181.0, + "step": 8707 + }, + { + "epoch": 0.9562925543597628, + "grad_norm": 2.205270528793335, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7197140455245972, + "num_tokens": 220614750.0, + "step": 8708 + }, + { + "epoch": 0.9564023720623764, + "grad_norm": 1.9710276126861572, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7149906158447266, + "num_tokens": 220644757.0, + "step": 8709 + }, + { + "epoch": 0.9565121897649901, + "grad_norm": 2.1507604122161865, + "learning_rate": 1e-06, + "loss": 0.9034, + "mean_token_accuracy": 
0.7179285287857056, + "num_tokens": 220669237.0, + "step": 8710 + }, + { + "epoch": 0.9566220074676037, + "grad_norm": 1.8943350315093994, + "learning_rate": 1e-06, + "loss": 1.0899, + "mean_token_accuracy": 0.6728401780128479, + "num_tokens": 220703976.0, + "step": 8711 + }, + { + "epoch": 0.9567318251702175, + "grad_norm": 2.176567316055298, + "learning_rate": 1e-06, + "loss": 0.8248, + "mean_token_accuracy": 0.7465351223945618, + "num_tokens": 220726137.0, + "step": 8712 + }, + { + "epoch": 0.9568416428728311, + "grad_norm": 2.3944003582000732, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7150411605834961, + "num_tokens": 220748131.0, + "step": 8713 + }, + { + "epoch": 0.9569514605754448, + "grad_norm": 2.1452884674072266, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.703673243522644, + "num_tokens": 220775175.0, + "step": 8714 + }, + { + "epoch": 0.9570612782780584, + "grad_norm": 2.266390323638916, + "learning_rate": 1e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7259664535522461, + "num_tokens": 220798234.0, + "step": 8715 + }, + { + "epoch": 0.9571710959806721, + "grad_norm": 2.0043647289276123, + "learning_rate": 1e-06, + "loss": 0.8504, + "mean_token_accuracy": 0.7415634393692017, + "num_tokens": 220825558.0, + "step": 8716 + }, + { + "epoch": 0.9572809136832857, + "grad_norm": 2.17763352394104, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7213815450668335, + "num_tokens": 220850614.0, + "step": 8717 + }, + { + "epoch": 0.9573907313858994, + "grad_norm": 2.445405960083008, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7150108814239502, + "num_tokens": 220873638.0, + "step": 8718 + }, + { + "epoch": 0.9575005490885131, + "grad_norm": 2.254795551300049, + "learning_rate": 1e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.6934462189674377, + "num_tokens": 220898059.0, + "step": 8719 + }, + { + "epoch": 0.9576103667911268, + "grad_norm": 
1.9883739948272705, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.709465503692627, + "num_tokens": 220925492.0, + "step": 8720 + }, + { + "epoch": 0.9577201844937404, + "grad_norm": 1.9712674617767334, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7109744548797607, + "num_tokens": 220952776.0, + "step": 8721 + }, + { + "epoch": 0.9578300021963541, + "grad_norm": 2.1307032108306885, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.725466251373291, + "num_tokens": 220976140.0, + "step": 8722 + }, + { + "epoch": 0.9579398198989677, + "grad_norm": 2.2171247005462646, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.7025046348571777, + "num_tokens": 221002853.0, + "step": 8723 + }, + { + "epoch": 0.9580496376015813, + "grad_norm": 1.9217112064361572, + "learning_rate": 1e-06, + "loss": 1.0483, + "mean_token_accuracy": 0.6825548410415649, + "num_tokens": 221036557.0, + "step": 8724 + }, + { + "epoch": 0.958159455304195, + "grad_norm": 2.2664573192596436, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7252911925315857, + "num_tokens": 221058890.0, + "step": 8725 + }, + { + "epoch": 0.9582692730068088, + "grad_norm": 2.2460153102874756, + "learning_rate": 1e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.6905918121337891, + "num_tokens": 221086465.0, + "step": 8726 + }, + { + "epoch": 0.9583790907094224, + "grad_norm": 2.073063373565674, + "learning_rate": 1e-06, + "loss": 1.0289, + "mean_token_accuracy": 0.686011016368866, + "num_tokens": 221114697.0, + "step": 8727 + }, + { + "epoch": 0.958488908412036, + "grad_norm": 2.1075143814086914, + "learning_rate": 1e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.7256039977073669, + "num_tokens": 221139045.0, + "step": 8728 + }, + { + "epoch": 0.9585987261146497, + "grad_norm": 2.0767767429351807, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7220848202705383, + "num_tokens": 
221166439.0, + "step": 8729 + }, + { + "epoch": 0.9587085438172633, + "grad_norm": 2.149190902709961, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7048384547233582, + "num_tokens": 221194448.0, + "step": 8730 + }, + { + "epoch": 0.958818361519877, + "grad_norm": 2.0983176231384277, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7037167549133301, + "num_tokens": 221220486.0, + "step": 8731 + }, + { + "epoch": 0.9589281792224906, + "grad_norm": 2.2928266525268555, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7075374722480774, + "num_tokens": 221245382.0, + "step": 8732 + }, + { + "epoch": 0.9590379969251043, + "grad_norm": 2.06667160987854, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7144852876663208, + "num_tokens": 221274003.0, + "step": 8733 + }, + { + "epoch": 0.959147814627718, + "grad_norm": 2.1221466064453125, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7124624252319336, + "num_tokens": 221299042.0, + "step": 8734 + }, + { + "epoch": 0.9592576323303317, + "grad_norm": 2.2531731128692627, + "learning_rate": 1e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.6996979713439941, + "num_tokens": 221322616.0, + "step": 8735 + }, + { + "epoch": 0.9593674500329453, + "grad_norm": 1.8833198547363281, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.693474531173706, + "num_tokens": 221358057.0, + "step": 8736 + }, + { + "epoch": 0.959477267735559, + "grad_norm": 1.9223257303237915, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.6993770003318787, + "num_tokens": 221389944.0, + "step": 8737 + }, + { + "epoch": 0.9595870854381726, + "grad_norm": 1.9285876750946045, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7282298803329468, + "num_tokens": 221419713.0, + "step": 8738 + }, + { + "epoch": 0.9596969031407863, + "grad_norm": 1.8865550756454468, + "learning_rate": 1e-06, + 
"loss": 1.014, + "mean_token_accuracy": 0.6853166818618774, + "num_tokens": 221451360.0, + "step": 8739 + }, + { + "epoch": 0.9598067208433999, + "grad_norm": 2.32163405418396, + "learning_rate": 1e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7192478775978088, + "num_tokens": 221475095.0, + "step": 8740 + }, + { + "epoch": 0.9599165385460137, + "grad_norm": 2.479339361190796, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7205216884613037, + "num_tokens": 221496092.0, + "step": 8741 + }, + { + "epoch": 0.9600263562486273, + "grad_norm": 2.2305705547332764, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7064478397369385, + "num_tokens": 221520207.0, + "step": 8742 + }, + { + "epoch": 0.960136173951241, + "grad_norm": 2.239672899246216, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.7015686631202698, + "num_tokens": 221547074.0, + "step": 8743 + }, + { + "epoch": 0.9602459916538546, + "grad_norm": 2.2625882625579834, + "learning_rate": 1e-06, + "loss": 0.8621, + "mean_token_accuracy": 0.7269443273544312, + "num_tokens": 221568943.0, + "step": 8744 + }, + { + "epoch": 0.9603558093564682, + "grad_norm": 2.1785221099853516, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.732512891292572, + "num_tokens": 221593366.0, + "step": 8745 + }, + { + "epoch": 0.9604656270590819, + "grad_norm": 2.4235963821411133, + "learning_rate": 1e-06, + "loss": 0.8653, + "mean_token_accuracy": 0.7278916835784912, + "num_tokens": 221612132.0, + "step": 8746 + }, + { + "epoch": 0.9605754447616955, + "grad_norm": 2.288203001022339, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7229716181755066, + "num_tokens": 221636156.0, + "step": 8747 + }, + { + "epoch": 0.9606852624643093, + "grad_norm": 2.208160877227783, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7022413015365601, + "num_tokens": 221662240.0, + "step": 8748 + }, + { + "epoch": 
0.9607950801669229, + "grad_norm": 2.1851589679718018, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.7030817866325378, + "num_tokens": 221688458.0, + "step": 8749 + }, + { + "epoch": 0.9609048978695366, + "grad_norm": 2.319667100906372, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7149224281311035, + "num_tokens": 221710185.0, + "step": 8750 + }, + { + "epoch": 0.9610147155721502, + "grad_norm": 2.2903010845184326, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.718998908996582, + "num_tokens": 221732201.0, + "step": 8751 + }, + { + "epoch": 0.9611245332747639, + "grad_norm": 2.344785451889038, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.6959390044212341, + "num_tokens": 221755478.0, + "step": 8752 + }, + { + "epoch": 0.9612343509773775, + "grad_norm": 2.358891248703003, + "learning_rate": 1e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.7274082899093628, + "num_tokens": 221776087.0, + "step": 8753 + }, + { + "epoch": 0.9613441686799912, + "grad_norm": 2.3919432163238525, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7137508392333984, + "num_tokens": 221797609.0, + "step": 8754 + }, + { + "epoch": 0.9614539863826049, + "grad_norm": 2.2246720790863037, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7116556167602539, + "num_tokens": 221821076.0, + "step": 8755 + }, + { + "epoch": 0.9615638040852186, + "grad_norm": 2.0018301010131836, + "learning_rate": 1e-06, + "loss": 0.8162, + "mean_token_accuracy": 0.7500834465026855, + "num_tokens": 221847095.0, + "step": 8756 + }, + { + "epoch": 0.9616736217878322, + "grad_norm": 2.130603075027466, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7046769261360168, + "num_tokens": 221874078.0, + "step": 8757 + }, + { + "epoch": 0.9617834394904459, + "grad_norm": 2.102269172668457, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 
0.6964071989059448, + "num_tokens": 221898515.0, + "step": 8758 + }, + { + "epoch": 0.9618932571930595, + "grad_norm": 2.1411097049713135, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.6961538791656494, + "num_tokens": 221925516.0, + "step": 8759 + }, + { + "epoch": 0.9620030748956732, + "grad_norm": 1.9248806238174438, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7242947220802307, + "num_tokens": 221953505.0, + "step": 8760 + }, + { + "epoch": 0.9621128925982868, + "grad_norm": 2.4261538982391357, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7172894477844238, + "num_tokens": 221974355.0, + "step": 8761 + }, + { + "epoch": 0.9622227103009005, + "grad_norm": 1.9076571464538574, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7052246332168579, + "num_tokens": 222007054.0, + "step": 8762 + }, + { + "epoch": 0.9623325280035142, + "grad_norm": 2.150886297225952, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.6912710666656494, + "num_tokens": 222032810.0, + "step": 8763 + }, + { + "epoch": 0.9624423457061279, + "grad_norm": 2.2274680137634277, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7066864371299744, + "num_tokens": 222057030.0, + "step": 8764 + }, + { + "epoch": 0.9625521634087415, + "grad_norm": 2.165592670440674, + "learning_rate": 1e-06, + "loss": 0.8388, + "mean_token_accuracy": 0.7322149872779846, + "num_tokens": 222081331.0, + "step": 8765 + }, + { + "epoch": 0.9626619811113551, + "grad_norm": 2.3403921127319336, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.696882963180542, + "num_tokens": 222104509.0, + "step": 8766 + }, + { + "epoch": 0.9627717988139688, + "grad_norm": 2.3756046295166016, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7238932847976685, + "num_tokens": 222126144.0, + "step": 8767 + }, + { + "epoch": 0.9628816165165824, + "grad_norm": 
2.1287765502929688, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7036306262016296, + "num_tokens": 222152786.0, + "step": 8768 + }, + { + "epoch": 0.9629914342191961, + "grad_norm": 1.9051257371902466, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7087490558624268, + "num_tokens": 222185051.0, + "step": 8769 + }, + { + "epoch": 0.9631012519218098, + "grad_norm": 2.0899312496185303, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7165025472640991, + "num_tokens": 222212000.0, + "step": 8770 + }, + { + "epoch": 0.9632110696244235, + "grad_norm": 2.2326440811157227, + "learning_rate": 1e-06, + "loss": 1.0535, + "mean_token_accuracy": 0.6865882873535156, + "num_tokens": 222238297.0, + "step": 8771 + }, + { + "epoch": 0.9633208873270371, + "grad_norm": 2.0657873153686523, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7111905813217163, + "num_tokens": 222266970.0, + "step": 8772 + }, + { + "epoch": 0.9634307050296508, + "grad_norm": 2.2997984886169434, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7115048766136169, + "num_tokens": 222289608.0, + "step": 8773 + }, + { + "epoch": 0.9635405227322644, + "grad_norm": 2.050368070602417, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7090167999267578, + "num_tokens": 222317224.0, + "step": 8774 + }, + { + "epoch": 0.9636503404348781, + "grad_norm": 2.057957410812378, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7023599147796631, + "num_tokens": 222344884.0, + "step": 8775 + }, + { + "epoch": 0.9637601581374917, + "grad_norm": 2.196100950241089, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7168401479721069, + "num_tokens": 222367687.0, + "step": 8776 + }, + { + "epoch": 0.9638699758401055, + "grad_norm": 2.368393659591675, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7150589227676392, + "num_tokens": 
222389968.0, + "step": 8777 + }, + { + "epoch": 0.9639797935427191, + "grad_norm": 2.4157521724700928, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.7045896053314209, + "num_tokens": 222413785.0, + "step": 8778 + }, + { + "epoch": 0.9640896112453328, + "grad_norm": 2.37200927734375, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7184926271438599, + "num_tokens": 222435082.0, + "step": 8779 + }, + { + "epoch": 0.9641994289479464, + "grad_norm": 1.9384372234344482, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.6882370710372925, + "num_tokens": 222467357.0, + "step": 8780 + }, + { + "epoch": 0.9643092466505601, + "grad_norm": 1.88242506980896, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7102761268615723, + "num_tokens": 222498349.0, + "step": 8781 + }, + { + "epoch": 0.9644190643531737, + "grad_norm": 2.4940454959869385, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7149658203125, + "num_tokens": 222519220.0, + "step": 8782 + }, + { + "epoch": 0.9645288820557874, + "grad_norm": 2.5647077560424805, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.702368974685669, + "num_tokens": 222540775.0, + "step": 8783 + }, + { + "epoch": 0.9646386997584011, + "grad_norm": 2.4608278274536133, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.716972827911377, + "num_tokens": 222561954.0, + "step": 8784 + }, + { + "epoch": 0.9647485174610148, + "grad_norm": 2.233579158782959, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.7010752558708191, + "num_tokens": 222586502.0, + "step": 8785 + }, + { + "epoch": 0.9648583351636284, + "grad_norm": 2.2074661254882812, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7065314650535583, + "num_tokens": 222612829.0, + "step": 8786 + }, + { + "epoch": 0.964968152866242, + "grad_norm": 2.3181395530700684, + "learning_rate": 1e-06, + "loss": 
0.9904, + "mean_token_accuracy": 0.7047691941261292, + "num_tokens": 222636476.0, + "step": 8787 + }, + { + "epoch": 0.9650779705688557, + "grad_norm": 2.2662508487701416, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7144185900688171, + "num_tokens": 222658390.0, + "step": 8788 + }, + { + "epoch": 0.9651877882714693, + "grad_norm": 2.1694185733795166, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7138633728027344, + "num_tokens": 222683099.0, + "step": 8789 + }, + { + "epoch": 0.965297605974083, + "grad_norm": 2.261733293533325, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7113708257675171, + "num_tokens": 222707048.0, + "step": 8790 + }, + { + "epoch": 0.9654074236766966, + "grad_norm": 2.4198575019836426, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7073194980621338, + "num_tokens": 222728694.0, + "step": 8791 + }, + { + "epoch": 0.9655172413793104, + "grad_norm": 2.532777786254883, + "learning_rate": 1e-06, + "loss": 0.8456, + "mean_token_accuracy": 0.7473927736282349, + "num_tokens": 222747839.0, + "step": 8792 + }, + { + "epoch": 0.965627059081924, + "grad_norm": 2.362347364425659, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7092094421386719, + "num_tokens": 222770654.0, + "step": 8793 + }, + { + "epoch": 0.9657368767845377, + "grad_norm": 2.1201837062835693, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7118400931358337, + "num_tokens": 222798505.0, + "step": 8794 + }, + { + "epoch": 0.9658466944871513, + "grad_norm": 2.051931619644165, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7115950584411621, + "num_tokens": 222826307.0, + "step": 8795 + }, + { + "epoch": 0.965956512189765, + "grad_norm": 2.1885437965393066, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7030572295188904, + "num_tokens": 222853143.0, + "step": 8796 + }, + { + "epoch": 
0.9660663298923786, + "grad_norm": 2.34000563621521, + "learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.734374463558197, + "num_tokens": 222874281.0, + "step": 8797 + }, + { + "epoch": 0.9661761475949923, + "grad_norm": 2.4715754985809326, + "learning_rate": 1e-06, + "loss": 0.8495, + "mean_token_accuracy": 0.7269020676612854, + "num_tokens": 222893370.0, + "step": 8798 + }, + { + "epoch": 0.966285965297606, + "grad_norm": 2.094080686569214, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7045458555221558, + "num_tokens": 222920566.0, + "step": 8799 + }, + { + "epoch": 0.9663957830002197, + "grad_norm": 2.083582878112793, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7331018447875977, + "num_tokens": 222944926.0, + "step": 8800 + }, + { + "epoch": 0.9665056007028333, + "grad_norm": 2.352670907974243, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7158000469207764, + "num_tokens": 222968659.0, + "step": 8801 + }, + { + "epoch": 0.966615418405447, + "grad_norm": 2.1899983882904053, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7104274034500122, + "num_tokens": 222992970.0, + "step": 8802 + }, + { + "epoch": 0.9667252361080606, + "grad_norm": 2.470304250717163, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7260438203811646, + "num_tokens": 223014449.0, + "step": 8803 + }, + { + "epoch": 0.9668350538106742, + "grad_norm": 2.519944667816162, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7202174067497253, + "num_tokens": 223033688.0, + "step": 8804 + }, + { + "epoch": 0.9669448715132879, + "grad_norm": 2.271197557449341, + "learning_rate": 1e-06, + "loss": 0.8583, + "mean_token_accuracy": 0.728515088558197, + "num_tokens": 223055581.0, + "step": 8805 + }, + { + "epoch": 0.9670546892159017, + "grad_norm": 2.2271101474761963, + "learning_rate": 1e-06, + "loss": 0.8709, + "mean_token_accuracy": 
0.7335770726203918, + "num_tokens": 223078668.0, + "step": 8806 + }, + { + "epoch": 0.9671645069185153, + "grad_norm": 2.0923733711242676, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7061691284179688, + "num_tokens": 223105607.0, + "step": 8807 + }, + { + "epoch": 0.967274324621129, + "grad_norm": 2.07363224029541, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.6889996528625488, + "num_tokens": 223135903.0, + "step": 8808 + }, + { + "epoch": 0.9673841423237426, + "grad_norm": 2.283130407333374, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7110357284545898, + "num_tokens": 223159366.0, + "step": 8809 + }, + { + "epoch": 0.9674939600263562, + "grad_norm": 2.192833423614502, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7173277735710144, + "num_tokens": 223182585.0, + "step": 8810 + }, + { + "epoch": 0.9676037777289699, + "grad_norm": 1.9948900938034058, + "learning_rate": 1e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.7295393347740173, + "num_tokens": 223209863.0, + "step": 8811 + }, + { + "epoch": 0.9677135954315835, + "grad_norm": 2.5059564113616943, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7171493768692017, + "num_tokens": 223229646.0, + "step": 8812 + }, + { + "epoch": 0.9678234131341973, + "grad_norm": 2.137148857116699, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7146731615066528, + "num_tokens": 223257025.0, + "step": 8813 + }, + { + "epoch": 0.9679332308368109, + "grad_norm": 2.147326707839966, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7289236783981323, + "num_tokens": 223283551.0, + "step": 8814 + }, + { + "epoch": 0.9680430485394246, + "grad_norm": 2.2977375984191895, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7093546986579895, + "num_tokens": 223308568.0, + "step": 8815 + }, + { + "epoch": 0.9681528662420382, + "grad_norm": 
2.3427484035491943, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7244217395782471, + "num_tokens": 223330765.0, + "step": 8816 + }, + { + "epoch": 0.9682626839446519, + "grad_norm": 2.315725088119507, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7036740779876709, + "num_tokens": 223353029.0, + "step": 8817 + }, + { + "epoch": 0.9683725016472655, + "grad_norm": 2.1390180587768555, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7233312129974365, + "num_tokens": 223379542.0, + "step": 8818 + }, + { + "epoch": 0.9684823193498792, + "grad_norm": 2.314129114151001, + "learning_rate": 1e-06, + "loss": 1.0562, + "mean_token_accuracy": 0.6895349025726318, + "num_tokens": 223404123.0, + "step": 8819 + }, + { + "epoch": 0.9685921370524928, + "grad_norm": 1.987245798110962, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.717201828956604, + "num_tokens": 223431550.0, + "step": 8820 + }, + { + "epoch": 0.9687019547551066, + "grad_norm": 2.2654807567596436, + "learning_rate": 1e-06, + "loss": 1.0346, + "mean_token_accuracy": 0.6876918077468872, + "num_tokens": 223454749.0, + "step": 8821 + }, + { + "epoch": 0.9688117724577202, + "grad_norm": 2.157284736633301, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7179614305496216, + "num_tokens": 223478642.0, + "step": 8822 + }, + { + "epoch": 0.9689215901603339, + "grad_norm": 2.24432635307312, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7308396100997925, + "num_tokens": 223501408.0, + "step": 8823 + }, + { + "epoch": 0.9690314078629475, + "grad_norm": 2.641406774520874, + "learning_rate": 1e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7226660847663879, + "num_tokens": 223519263.0, + "step": 8824 + }, + { + "epoch": 0.9691412255655611, + "grad_norm": 2.2277872562408447, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.6941811442375183, + "num_tokens": 
223544625.0, + "step": 8825 + }, + { + "epoch": 0.9692510432681748, + "grad_norm": 2.113600730895996, + "learning_rate": 1e-06, + "loss": 1.029, + "mean_token_accuracy": 0.6834803819656372, + "num_tokens": 223572304.0, + "step": 8826 + }, + { + "epoch": 0.9693608609707884, + "grad_norm": 2.203350067138672, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.715729832649231, + "num_tokens": 223596758.0, + "step": 8827 + }, + { + "epoch": 0.9694706786734022, + "grad_norm": 2.072017192840576, + "learning_rate": 1e-06, + "loss": 0.8223, + "mean_token_accuracy": 0.7472463846206665, + "num_tokens": 223621738.0, + "step": 8828 + }, + { + "epoch": 0.9695804963760158, + "grad_norm": 2.0128512382507324, + "learning_rate": 1e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7258812189102173, + "num_tokens": 223650348.0, + "step": 8829 + }, + { + "epoch": 0.9696903140786295, + "grad_norm": 2.1160888671875, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7105213403701782, + "num_tokens": 223677130.0, + "step": 8830 + }, + { + "epoch": 0.9698001317812431, + "grad_norm": 2.260284662246704, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6942332983016968, + "num_tokens": 223702975.0, + "step": 8831 + }, + { + "epoch": 0.9699099494838568, + "grad_norm": 2.066516399383545, + "learning_rate": 1e-06, + "loss": 1.0244, + "mean_token_accuracy": 0.6935313940048218, + "num_tokens": 223733960.0, + "step": 8832 + }, + { + "epoch": 0.9700197671864704, + "grad_norm": 2.507739543914795, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7199706435203552, + "num_tokens": 223754470.0, + "step": 8833 + }, + { + "epoch": 0.9701295848890841, + "grad_norm": 2.1217799186706543, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.717748761177063, + "num_tokens": 223779152.0, + "step": 8834 + }, + { + "epoch": 0.9702394025916978, + "grad_norm": 2.3220460414886475, + "learning_rate": 1e-06, + "loss": 
0.847, + "mean_token_accuracy": 0.7326349020004272, + "num_tokens": 223801530.0, + "step": 8835 + }, + { + "epoch": 0.9703492202943115, + "grad_norm": 2.0711119174957275, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.7039596438407898, + "num_tokens": 223830259.0, + "step": 8836 + }, + { + "epoch": 0.9704590379969251, + "grad_norm": 1.9073725938796997, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.7054661512374878, + "num_tokens": 223861330.0, + "step": 8837 + }, + { + "epoch": 0.9705688556995388, + "grad_norm": 2.069727659225464, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7249433994293213, + "num_tokens": 223887452.0, + "step": 8838 + }, + { + "epoch": 0.9706786734021524, + "grad_norm": 2.366713285446167, + "learning_rate": 1e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.6916611194610596, + "num_tokens": 223911133.0, + "step": 8839 + }, + { + "epoch": 0.9707884911047661, + "grad_norm": 2.0664312839508057, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7090450525283813, + "num_tokens": 223938441.0, + "step": 8840 + }, + { + "epoch": 0.9708983088073797, + "grad_norm": 2.1728553771972656, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7115264534950256, + "num_tokens": 223964025.0, + "step": 8841 + }, + { + "epoch": 0.9710081265099935, + "grad_norm": 2.1648030281066895, + "learning_rate": 1e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.7024852633476257, + "num_tokens": 223988998.0, + "step": 8842 + }, + { + "epoch": 0.9711179442126071, + "grad_norm": 1.8988910913467407, + "learning_rate": 1e-06, + "loss": 1.0808, + "mean_token_accuracy": 0.6878662109375, + "num_tokens": 224023147.0, + "step": 8843 + }, + { + "epoch": 0.9712277619152208, + "grad_norm": 1.8754407167434692, + "learning_rate": 1e-06, + "loss": 1.036, + "mean_token_accuracy": 0.6931207776069641, + "num_tokens": 224056743.0, + "step": 8844 + }, + { + "epoch": 
0.9713375796178344, + "grad_norm": 2.3000972270965576, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7123450040817261, + "num_tokens": 224079001.0, + "step": 8845 + }, + { + "epoch": 0.971447397320448, + "grad_norm": 2.249387741088867, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7037770748138428, + "num_tokens": 224103215.0, + "step": 8846 + }, + { + "epoch": 0.9715572150230617, + "grad_norm": 2.0789947509765625, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.69498610496521, + "num_tokens": 224130386.0, + "step": 8847 + }, + { + "epoch": 0.9716670327256753, + "grad_norm": 2.587939739227295, + "learning_rate": 1e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7224634885787964, + "num_tokens": 224149792.0, + "step": 8848 + }, + { + "epoch": 0.971776850428289, + "grad_norm": 1.967550277709961, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7193613052368164, + "num_tokens": 224177526.0, + "step": 8849 + }, + { + "epoch": 0.9718866681309027, + "grad_norm": 2.473149538040161, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7250165939331055, + "num_tokens": 224196405.0, + "step": 8850 + }, + { + "epoch": 0.9719964858335164, + "grad_norm": 1.987928867340088, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7047547101974487, + "num_tokens": 224228745.0, + "step": 8851 + }, + { + "epoch": 0.97210630353613, + "grad_norm": 2.0300486087799072, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.6949681639671326, + "num_tokens": 224254301.0, + "step": 8852 + }, + { + "epoch": 0.9722161212387437, + "grad_norm": 1.9070628881454468, + "learning_rate": 1e-06, + "loss": 1.0875, + "mean_token_accuracy": 0.686118483543396, + "num_tokens": 224286627.0, + "step": 8853 + }, + { + "epoch": 0.9723259389413573, + "grad_norm": 2.3930485248565674, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 
0.7174496650695801, + "num_tokens": 224307474.0, + "step": 8854 + }, + { + "epoch": 0.972435756643971, + "grad_norm": 2.320646286010742, + "learning_rate": 1e-06, + "loss": 0.8532, + "mean_token_accuracy": 0.7417521476745605, + "num_tokens": 224330618.0, + "step": 8855 + }, + { + "epoch": 0.9725455743465846, + "grad_norm": 2.171435594558716, + "learning_rate": 1e-06, + "loss": 0.8539, + "mean_token_accuracy": 0.7351531982421875, + "num_tokens": 224354557.0, + "step": 8856 + }, + { + "epoch": 0.9726553920491984, + "grad_norm": 1.8234251737594604, + "learning_rate": 1e-06, + "loss": 1.0548, + "mean_token_accuracy": 0.6890431046485901, + "num_tokens": 224389052.0, + "step": 8857 + }, + { + "epoch": 0.972765209751812, + "grad_norm": 2.1978228092193604, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7066349387168884, + "num_tokens": 224415220.0, + "step": 8858 + }, + { + "epoch": 0.9728750274544257, + "grad_norm": 2.2522990703582764, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7013810873031616, + "num_tokens": 224439468.0, + "step": 8859 + }, + { + "epoch": 0.9729848451570393, + "grad_norm": 2.26076078414917, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7219576239585876, + "num_tokens": 224462856.0, + "step": 8860 + }, + { + "epoch": 0.973094662859653, + "grad_norm": 2.2344114780426025, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7201747298240662, + "num_tokens": 224486211.0, + "step": 8861 + }, + { + "epoch": 0.9732044805622666, + "grad_norm": 2.120936393737793, + "learning_rate": 1e-06, + "loss": 1.0581, + "mean_token_accuracy": 0.6790375113487244, + "num_tokens": 224514567.0, + "step": 8862 + }, + { + "epoch": 0.9733142982648803, + "grad_norm": 2.4504785537719727, + "learning_rate": 1e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7284896373748779, + "num_tokens": 224534568.0, + "step": 8863 + }, + { + "epoch": 0.973424115967494, + "grad_norm": 
2.6547577381134033, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7117196321487427, + "num_tokens": 224554240.0, + "step": 8864 + }, + { + "epoch": 0.9735339336701077, + "grad_norm": 2.177109718322754, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.6947882771492004, + "num_tokens": 224579219.0, + "step": 8865 + }, + { + "epoch": 0.9736437513727213, + "grad_norm": 2.100870132446289, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7043551206588745, + "num_tokens": 224605034.0, + "step": 8866 + }, + { + "epoch": 0.973753569075335, + "grad_norm": 2.141786813735962, + "learning_rate": 1e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.6947603225708008, + "num_tokens": 224632977.0, + "step": 8867 + }, + { + "epoch": 0.9738633867779486, + "grad_norm": 2.1899664402008057, + "learning_rate": 1e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.6845269203186035, + "num_tokens": 224659116.0, + "step": 8868 + }, + { + "epoch": 0.9739732044805622, + "grad_norm": 2.2729690074920654, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7145611047744751, + "num_tokens": 224683455.0, + "step": 8869 + }, + { + "epoch": 0.9740830221831759, + "grad_norm": 2.3711817264556885, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7066730260848999, + "num_tokens": 224704047.0, + "step": 8870 + }, + { + "epoch": 0.9741928398857896, + "grad_norm": 2.3293163776397705, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7070218324661255, + "num_tokens": 224727908.0, + "step": 8871 + }, + { + "epoch": 0.9743026575884033, + "grad_norm": 2.271456718444824, + "learning_rate": 1e-06, + "loss": 0.8191, + "mean_token_accuracy": 0.7421350479125977, + "num_tokens": 224751279.0, + "step": 8872 + }, + { + "epoch": 0.9744124752910169, + "grad_norm": 2.165167808532715, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.6934005618095398, + "num_tokens": 
224777624.0, + "step": 8873 + }, + { + "epoch": 0.9745222929936306, + "grad_norm": 2.2660980224609375, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7186155319213867, + "num_tokens": 224800346.0, + "step": 8874 + }, + { + "epoch": 0.9746321106962442, + "grad_norm": 2.414555072784424, + "learning_rate": 1e-06, + "loss": 0.856, + "mean_token_accuracy": 0.7373586893081665, + "num_tokens": 224820784.0, + "step": 8875 + }, + { + "epoch": 0.9747419283988579, + "grad_norm": 2.2326159477233887, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7002501487731934, + "num_tokens": 224845416.0, + "step": 8876 + }, + { + "epoch": 0.9748517461014715, + "grad_norm": 2.0654001235961914, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.688583254814148, + "num_tokens": 224873364.0, + "step": 8877 + }, + { + "epoch": 0.9749615638040853, + "grad_norm": 2.4002132415771484, + "learning_rate": 1e-06, + "loss": 1.025, + "mean_token_accuracy": 0.6973535418510437, + "num_tokens": 224895425.0, + "step": 8878 + }, + { + "epoch": 0.9750713815066989, + "grad_norm": 2.5004916191101074, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7138069272041321, + "num_tokens": 224915616.0, + "step": 8879 + }, + { + "epoch": 0.9751811992093126, + "grad_norm": 2.333641767501831, + "learning_rate": 1e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.7193400859832764, + "num_tokens": 224936786.0, + "step": 8880 + }, + { + "epoch": 0.9752910169119262, + "grad_norm": 2.041951894760132, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7146427631378174, + "num_tokens": 224964703.0, + "step": 8881 + }, + { + "epoch": 0.9754008346145399, + "grad_norm": 2.3801066875457764, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7031071782112122, + "num_tokens": 224988466.0, + "step": 8882 + }, + { + "epoch": 0.9755106523171535, + "grad_norm": 2.1020143032073975, + "learning_rate": 1e-06, + 
"loss": 0.9625, + "mean_token_accuracy": 0.7103534936904907, + "num_tokens": 225015452.0, + "step": 8883 + }, + { + "epoch": 0.9756204700197671, + "grad_norm": 2.522399425506592, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7195857167243958, + "num_tokens": 225035207.0, + "step": 8884 + }, + { + "epoch": 0.9757302877223808, + "grad_norm": 2.3191728591918945, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7162092924118042, + "num_tokens": 225059386.0, + "step": 8885 + }, + { + "epoch": 0.9758401054249946, + "grad_norm": 2.236898899078369, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.6938776969909668, + "num_tokens": 225083549.0, + "step": 8886 + }, + { + "epoch": 0.9759499231276082, + "grad_norm": 1.9208292961120605, + "learning_rate": 1e-06, + "loss": 1.0319, + "mean_token_accuracy": 0.6862027645111084, + "num_tokens": 225117476.0, + "step": 8887 + }, + { + "epoch": 0.9760597408302218, + "grad_norm": 2.0370075702667236, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7119936943054199, + "num_tokens": 225146434.0, + "step": 8888 + }, + { + "epoch": 0.9761695585328355, + "grad_norm": 1.9611942768096924, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.6982476711273193, + "num_tokens": 225177303.0, + "step": 8889 + }, + { + "epoch": 0.9762793762354491, + "grad_norm": 2.0524024963378906, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.6966553926467896, + "num_tokens": 225207871.0, + "step": 8890 + }, + { + "epoch": 0.9763891939380628, + "grad_norm": 2.2706708908081055, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.7277937531471252, + "num_tokens": 225231232.0, + "step": 8891 + }, + { + "epoch": 0.9764990116406764, + "grad_norm": 2.050917863845825, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7201862335205078, + "num_tokens": 225258828.0, + "step": 8892 + }, + { + "epoch": 
0.9766088293432902, + "grad_norm": 2.4290030002593994, + "learning_rate": 1e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.7275750637054443, + "num_tokens": 225281440.0, + "step": 8893 + }, + { + "epoch": 0.9767186470459038, + "grad_norm": 2.121027946472168, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7152993679046631, + "num_tokens": 225307785.0, + "step": 8894 + }, + { + "epoch": 0.9768284647485175, + "grad_norm": 2.26901912689209, + "learning_rate": 1e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7303072214126587, + "num_tokens": 225330148.0, + "step": 8895 + }, + { + "epoch": 0.9769382824511311, + "grad_norm": 2.1097705364227295, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7176449298858643, + "num_tokens": 225356252.0, + "step": 8896 + }, + { + "epoch": 0.9770481001537448, + "grad_norm": 2.042264461517334, + "learning_rate": 1e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.7246389985084534, + "num_tokens": 225384686.0, + "step": 8897 + }, + { + "epoch": 0.9771579178563584, + "grad_norm": 1.991971492767334, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7009937763214111, + "num_tokens": 225412771.0, + "step": 8898 + }, + { + "epoch": 0.9772677355589721, + "grad_norm": 2.1331522464752197, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7001016139984131, + "num_tokens": 225439376.0, + "step": 8899 + }, + { + "epoch": 0.9773775532615858, + "grad_norm": 2.234963893890381, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7112044095993042, + "num_tokens": 225463389.0, + "step": 8900 + }, + { + "epoch": 0.9774873709641995, + "grad_norm": 2.2857892513275146, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7096196413040161, + "num_tokens": 225486458.0, + "step": 8901 + }, + { + "epoch": 0.9775971886668131, + "grad_norm": 2.1733076572418213, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 
0.7137430906295776, + "num_tokens": 225512729.0, + "step": 8902 + }, + { + "epoch": 0.9777070063694268, + "grad_norm": 2.2505743503570557, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.703640341758728, + "num_tokens": 225537018.0, + "step": 8903 + }, + { + "epoch": 0.9778168240720404, + "grad_norm": 1.9242453575134277, + "learning_rate": 1e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7175483107566833, + "num_tokens": 225567419.0, + "step": 8904 + }, + { + "epoch": 0.977926641774654, + "grad_norm": 2.3853278160095215, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7152289748191833, + "num_tokens": 225589458.0, + "step": 8905 + }, + { + "epoch": 0.9780364594772677, + "grad_norm": 2.297851800918579, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7081236839294434, + "num_tokens": 225612687.0, + "step": 8906 + }, + { + "epoch": 0.9781462771798815, + "grad_norm": 1.9244247674942017, + "learning_rate": 1e-06, + "loss": 1.0521, + "mean_token_accuracy": 0.6811636686325073, + "num_tokens": 225645115.0, + "step": 8907 + }, + { + "epoch": 0.9782560948824951, + "grad_norm": 2.886868953704834, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7136208415031433, + "num_tokens": 225660563.0, + "step": 8908 + }, + { + "epoch": 0.9783659125851087, + "grad_norm": 2.003162145614624, + "learning_rate": 1e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7237339019775391, + "num_tokens": 225689553.0, + "step": 8909 + }, + { + "epoch": 0.9784757302877224, + "grad_norm": 2.21956729888916, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7220450639724731, + "num_tokens": 225713158.0, + "step": 8910 + }, + { + "epoch": 0.978585547990336, + "grad_norm": 1.899848222732544, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.698211669921875, + "num_tokens": 225745195.0, + "step": 8911 + }, + { + "epoch": 0.9786953656929497, + "grad_norm": 
2.1104800701141357, + "learning_rate": 1e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.6845782995223999, + "num_tokens": 225773131.0, + "step": 8912 + }, + { + "epoch": 0.9788051833955633, + "grad_norm": 2.148425817489624, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7143831253051758, + "num_tokens": 225798671.0, + "step": 8913 + }, + { + "epoch": 0.978915001098177, + "grad_norm": 2.172931432723999, + "learning_rate": 1e-06, + "loss": 0.872, + "mean_token_accuracy": 0.7357443571090698, + "num_tokens": 225822248.0, + "step": 8914 + }, + { + "epoch": 0.9790248188007907, + "grad_norm": 2.0088696479797363, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.704571008682251, + "num_tokens": 225851354.0, + "step": 8915 + }, + { + "epoch": 0.9791346365034044, + "grad_norm": 2.264633893966675, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.6878306269645691, + "num_tokens": 225875913.0, + "step": 8916 + }, + { + "epoch": 0.979244454206018, + "grad_norm": 2.2409329414367676, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7007782459259033, + "num_tokens": 225901486.0, + "step": 8917 + }, + { + "epoch": 0.9793542719086317, + "grad_norm": 1.8447930812835693, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.732202410697937, + "num_tokens": 225934232.0, + "step": 8918 + }, + { + "epoch": 0.9794640896112453, + "grad_norm": 1.9113373756408691, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7025399208068848, + "num_tokens": 225966578.0, + "step": 8919 + }, + { + "epoch": 0.979573907313859, + "grad_norm": 2.0741214752197266, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7196241021156311, + "num_tokens": 225992791.0, + "step": 8920 + }, + { + "epoch": 0.9796837250164726, + "grad_norm": 2.206963062286377, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7031681537628174, + "num_tokens": 226018636.0, 
+ "step": 8921 + }, + { + "epoch": 0.9797935427190864, + "grad_norm": 2.162243366241455, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.7015885710716248, + "num_tokens": 226044470.0, + "step": 8922 + }, + { + "epoch": 0.9799033604217, + "grad_norm": 2.072824716567993, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.6998096704483032, + "num_tokens": 226073268.0, + "step": 8923 + }, + { + "epoch": 0.9800131781243137, + "grad_norm": 2.0396018028259277, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.721468448638916, + "num_tokens": 226099978.0, + "step": 8924 + }, + { + "epoch": 0.9801229958269273, + "grad_norm": 2.286698341369629, + "learning_rate": 1e-06, + "loss": 0.8609, + "mean_token_accuracy": 0.7267839908599854, + "num_tokens": 226122109.0, + "step": 8925 + }, + { + "epoch": 0.980232813529541, + "grad_norm": 1.9494357109069824, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7114585041999817, + "num_tokens": 226148561.0, + "step": 8926 + }, + { + "epoch": 0.9803426312321546, + "grad_norm": 2.108266830444336, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7125645279884338, + "num_tokens": 226173187.0, + "step": 8927 + }, + { + "epoch": 0.9804524489347682, + "grad_norm": 1.9948744773864746, + "learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7219609022140503, + "num_tokens": 226202605.0, + "step": 8928 + }, + { + "epoch": 0.980562266637382, + "grad_norm": 2.140685796737671, + "learning_rate": 1e-06, + "loss": 1.0366, + "mean_token_accuracy": 0.689111590385437, + "num_tokens": 226229325.0, + "step": 8929 + }, + { + "epoch": 0.9806720843399956, + "grad_norm": 2.3129515647888184, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7001199722290039, + "num_tokens": 226251583.0, + "step": 8930 + }, + { + "epoch": 0.9807819020426093, + "grad_norm": 2.522864580154419, + "learning_rate": 1e-06, + "loss": 0.9555, + 
"mean_token_accuracy": 0.7062642574310303, + "num_tokens": 226271625.0, + "step": 8931 + }, + { + "epoch": 0.9808917197452229, + "grad_norm": 2.2032530307769775, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7058665752410889, + "num_tokens": 226294273.0, + "step": 8932 + }, + { + "epoch": 0.9810015374478366, + "grad_norm": 2.235010862350464, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7003236413002014, + "num_tokens": 226319100.0, + "step": 8933 + }, + { + "epoch": 0.9811113551504502, + "grad_norm": 2.222242593765259, + "learning_rate": 1e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.6954153776168823, + "num_tokens": 226345451.0, + "step": 8934 + }, + { + "epoch": 0.9812211728530639, + "grad_norm": 2.2040445804595947, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7078840136528015, + "num_tokens": 226370956.0, + "step": 8935 + }, + { + "epoch": 0.9813309905556776, + "grad_norm": 2.2627198696136475, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.696233332157135, + "num_tokens": 226395607.0, + "step": 8936 + }, + { + "epoch": 0.9814408082582913, + "grad_norm": 2.474963665008545, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7165460586547852, + "num_tokens": 226415953.0, + "step": 8937 + }, + { + "epoch": 0.9815506259609049, + "grad_norm": 2.5471725463867188, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7285447716712952, + "num_tokens": 226435647.0, + "step": 8938 + }, + { + "epoch": 0.9816604436635186, + "grad_norm": 2.2503039836883545, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.6941719055175781, + "num_tokens": 226460625.0, + "step": 8939 + }, + { + "epoch": 0.9817702613661322, + "grad_norm": 2.5474631786346436, + "learning_rate": 1e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.7269278764724731, + "num_tokens": 226479355.0, + "step": 8940 + }, + { + "epoch": 0.9818800790687459, + 
"grad_norm": 2.1219401359558105, + "learning_rate": 1e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7279393076896667, + "num_tokens": 226503644.0, + "step": 8941 + }, + { + "epoch": 0.9819898967713595, + "grad_norm": 2.2602477073669434, + "learning_rate": 1e-06, + "loss": 0.8618, + "mean_token_accuracy": 0.7325946092605591, + "num_tokens": 226526683.0, + "step": 8942 + }, + { + "epoch": 0.9820997144739732, + "grad_norm": 2.0342719554901123, + "learning_rate": 1e-06, + "loss": 1.0433, + "mean_token_accuracy": 0.6882375478744507, + "num_tokens": 226555393.0, + "step": 8943 + }, + { + "epoch": 0.9822095321765869, + "grad_norm": 2.6083669662475586, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7111344337463379, + "num_tokens": 226575565.0, + "step": 8944 + }, + { + "epoch": 0.9823193498792006, + "grad_norm": 2.0732734203338623, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.6997360587120056, + "num_tokens": 226605191.0, + "step": 8945 + }, + { + "epoch": 0.9824291675818142, + "grad_norm": 2.373730421066284, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.693999171257019, + "num_tokens": 226628978.0, + "step": 8946 + }, + { + "epoch": 0.9825389852844278, + "grad_norm": 2.4441661834716797, + "learning_rate": 1e-06, + "loss": 0.8163, + "mean_token_accuracy": 0.7409628629684448, + "num_tokens": 226647570.0, + "step": 8947 + }, + { + "epoch": 0.9826488029870415, + "grad_norm": 2.0716488361358643, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.7238470315933228, + "num_tokens": 226674219.0, + "step": 8948 + }, + { + "epoch": 0.9827586206896551, + "grad_norm": 2.2961559295654297, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.6910364627838135, + "num_tokens": 226697958.0, + "step": 8949 + }, + { + "epoch": 0.9828684383922688, + "grad_norm": 2.3626487255096436, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7013317346572876, + 
"num_tokens": 226719844.0, + "step": 8950 + }, + { + "epoch": 0.9829782560948825, + "grad_norm": 1.9155415296554565, + "learning_rate": 1e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.6953282952308655, + "num_tokens": 226751159.0, + "step": 8951 + }, + { + "epoch": 0.9830880737974962, + "grad_norm": 2.2137672901153564, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7233763933181763, + "num_tokens": 226774286.0, + "step": 8952 + }, + { + "epoch": 0.9831978915001098, + "grad_norm": 2.415886402130127, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.709301233291626, + "num_tokens": 226793348.0, + "step": 8953 + }, + { + "epoch": 0.9833077092027235, + "grad_norm": 2.0460305213928223, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.716993510723114, + "num_tokens": 226820626.0, + "step": 8954 + }, + { + "epoch": 0.9834175269053371, + "grad_norm": 2.5718157291412354, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7043100595474243, + "num_tokens": 226841038.0, + "step": 8955 + }, + { + "epoch": 0.9835273446079508, + "grad_norm": 2.1355364322662354, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7163362503051758, + "num_tokens": 226865373.0, + "step": 8956 + }, + { + "epoch": 0.9836371623105644, + "grad_norm": 2.326587200164795, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7052332162857056, + "num_tokens": 226887539.0, + "step": 8957 + }, + { + "epoch": 0.9837469800131782, + "grad_norm": 2.2088465690612793, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7158128023147583, + "num_tokens": 226912987.0, + "step": 8958 + }, + { + "epoch": 0.9838567977157918, + "grad_norm": 2.380337715148926, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.6980335712432861, + "num_tokens": 226937309.0, + "step": 8959 + }, + { + "epoch": 0.9839666154184055, + "grad_norm": 2.107020139694214, + 
"learning_rate": 1e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.7369193434715271, + "num_tokens": 226961714.0, + "step": 8960 + }, + { + "epoch": 0.9840764331210191, + "grad_norm": 2.1246256828308105, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.712742269039154, + "num_tokens": 226987927.0, + "step": 8961 + }, + { + "epoch": 0.9841862508236328, + "grad_norm": 2.1522486209869385, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.734477162361145, + "num_tokens": 227013223.0, + "step": 8962 + }, + { + "epoch": 0.9842960685262464, + "grad_norm": 2.623448371887207, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7172027826309204, + "num_tokens": 227034228.0, + "step": 8963 + }, + { + "epoch": 0.98440588622886, + "grad_norm": 1.8893848657608032, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7064864635467529, + "num_tokens": 227067021.0, + "step": 8964 + }, + { + "epoch": 0.9845157039314738, + "grad_norm": 2.4121694564819336, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7090801000595093, + "num_tokens": 227088717.0, + "step": 8965 + }, + { + "epoch": 0.9846255216340875, + "grad_norm": 2.2380595207214355, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.6964120268821716, + "num_tokens": 227113151.0, + "step": 8966 + }, + { + "epoch": 0.9847353393367011, + "grad_norm": 1.8736324310302734, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7081177234649658, + "num_tokens": 227144942.0, + "step": 8967 + }, + { + "epoch": 0.9848451570393147, + "grad_norm": 2.0196094512939453, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7014546990394592, + "num_tokens": 227174683.0, + "step": 8968 + }, + { + "epoch": 0.9849549747419284, + "grad_norm": 1.9575845003128052, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7101225852966309, + "num_tokens": 227206993.0, + "step": 8969 + 
}, + { + "epoch": 0.985064792444542, + "grad_norm": 2.3184587955474854, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7128359079360962, + "num_tokens": 227229567.0, + "step": 8970 + }, + { + "epoch": 0.9851746101471557, + "grad_norm": 2.105013608932495, + "learning_rate": 1e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7288137674331665, + "num_tokens": 227256712.0, + "step": 8971 + }, + { + "epoch": 0.9852844278497693, + "grad_norm": 2.315138578414917, + "learning_rate": 1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.693678081035614, + "num_tokens": 227280079.0, + "step": 8972 + }, + { + "epoch": 0.9853942455523831, + "grad_norm": 2.050853967666626, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7189363241195679, + "num_tokens": 227309799.0, + "step": 8973 + }, + { + "epoch": 0.9855040632549967, + "grad_norm": 2.0261242389678955, + "learning_rate": 1e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.6837913990020752, + "num_tokens": 227339190.0, + "step": 8974 + }, + { + "epoch": 0.9856138809576104, + "grad_norm": 2.1862661838531494, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.6946378946304321, + "num_tokens": 227366067.0, + "step": 8975 + }, + { + "epoch": 0.985723698660224, + "grad_norm": 2.2069175243377686, + "learning_rate": 1e-06, + "loss": 0.877, + "mean_token_accuracy": 0.7304172515869141, + "num_tokens": 227389960.0, + "step": 8976 + }, + { + "epoch": 0.9858335163628377, + "grad_norm": 2.4429914951324463, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7171108722686768, + "num_tokens": 227409731.0, + "step": 8977 + }, + { + "epoch": 0.9859433340654513, + "grad_norm": 2.505854845046997, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.6964743137359619, + "num_tokens": 227429802.0, + "step": 8978 + }, + { + "epoch": 0.986053151768065, + "grad_norm": 2.000572443008423, + "learning_rate": 1e-06, + "loss": 1.0033, + 
"mean_token_accuracy": 0.6936148405075073, + "num_tokens": 227461281.0, + "step": 8979 + }, + { + "epoch": 0.9861629694706787, + "grad_norm": 2.598048210144043, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.71595299243927, + "num_tokens": 227479669.0, + "step": 8980 + }, + { + "epoch": 0.9862727871732924, + "grad_norm": 2.2667863368988037, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7188733816146851, + "num_tokens": 227501629.0, + "step": 8981 + }, + { + "epoch": 0.986382604875906, + "grad_norm": 2.559173583984375, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.715908944606781, + "num_tokens": 227521499.0, + "step": 8982 + }, + { + "epoch": 0.9864924225785197, + "grad_norm": 1.9698269367218018, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7225306034088135, + "num_tokens": 227551782.0, + "step": 8983 + }, + { + "epoch": 0.9866022402811333, + "grad_norm": 2.202652931213379, + "learning_rate": 1e-06, + "loss": 0.937, + "mean_token_accuracy": 0.7084828615188599, + "num_tokens": 227576630.0, + "step": 8984 + }, + { + "epoch": 0.986712057983747, + "grad_norm": 2.45896315574646, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7161081433296204, + "num_tokens": 227598891.0, + "step": 8985 + }, + { + "epoch": 0.9868218756863606, + "grad_norm": 2.414018154144287, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7073289752006531, + "num_tokens": 227621398.0, + "step": 8986 + }, + { + "epoch": 0.9869316933889744, + "grad_norm": 2.4283621311187744, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7119350433349609, + "num_tokens": 227643620.0, + "step": 8987 + }, + { + "epoch": 0.987041511091588, + "grad_norm": 2.237718343734741, + "learning_rate": 1e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.7270839810371399, + "num_tokens": 227668568.0, + "step": 8988 + }, + { + "epoch": 0.9871513287942016, + "grad_norm": 
2.4319894313812256, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.6931992769241333, + "num_tokens": 227691299.0, + "step": 8989 + }, + { + "epoch": 0.9872611464968153, + "grad_norm": 2.7478649616241455, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7135167121887207, + "num_tokens": 227709756.0, + "step": 8990 + }, + { + "epoch": 0.9873709641994289, + "grad_norm": 2.5431017875671387, + "learning_rate": 1e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7269211411476135, + "num_tokens": 227729487.0, + "step": 8991 + }, + { + "epoch": 0.9874807819020426, + "grad_norm": 2.3364315032958984, + "learning_rate": 1e-06, + "loss": 0.8526, + "mean_token_accuracy": 0.7302765846252441, + "num_tokens": 227751044.0, + "step": 8992 + }, + { + "epoch": 0.9875905996046562, + "grad_norm": 2.100393295288086, + "learning_rate": 1e-06, + "loss": 1.0529, + "mean_token_accuracy": 0.6815491318702698, + "num_tokens": 227779580.0, + "step": 8993 + }, + { + "epoch": 0.98770041730727, + "grad_norm": 1.995317816734314, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7183679342269897, + "num_tokens": 227809087.0, + "step": 8994 + }, + { + "epoch": 0.9878102350098836, + "grad_norm": 2.3520398139953613, + "learning_rate": 1e-06, + "loss": 0.7828, + "mean_token_accuracy": 0.7482320070266724, + "num_tokens": 227828940.0, + "step": 8995 + }, + { + "epoch": 0.9879200527124973, + "grad_norm": 2.2786853313446045, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7105787396430969, + "num_tokens": 227854182.0, + "step": 8996 + }, + { + "epoch": 0.9880298704151109, + "grad_norm": 2.2345504760742188, + "learning_rate": 1e-06, + "loss": 1.0645, + "mean_token_accuracy": 0.6808943748474121, + "num_tokens": 227882720.0, + "step": 8997 + }, + { + "epoch": 0.9881396881177246, + "grad_norm": 2.153571128845215, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7073750495910645, + "num_tokens": 
227908007.0, + "step": 8998 + }, + { + "epoch": 0.9882495058203382, + "grad_norm": 2.610938549041748, + "learning_rate": 1e-06, + "loss": 0.8228, + "mean_token_accuracy": 0.7435657978057861, + "num_tokens": 227927834.0, + "step": 8999 + }, + { + "epoch": 0.9883593235229519, + "grad_norm": 2.1672587394714355, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7014108300209045, + "num_tokens": 227954851.0, + "step": 9000 + }, + { + "epoch": 0.9884691412255655, + "grad_norm": 2.0731303691864014, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7123510837554932, + "num_tokens": 227980788.0, + "step": 9001 + }, + { + "epoch": 0.9885789589281793, + "grad_norm": 2.3414714336395264, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.7003447413444519, + "num_tokens": 228004254.0, + "step": 9002 + }, + { + "epoch": 0.9886887766307929, + "grad_norm": 1.9536464214324951, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7037613391876221, + "num_tokens": 228035196.0, + "step": 9003 + }, + { + "epoch": 0.9887985943334066, + "grad_norm": 2.286898136138916, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7361140251159668, + "num_tokens": 228056911.0, + "step": 9004 + }, + { + "epoch": 0.9889084120360202, + "grad_norm": 1.9978445768356323, + "learning_rate": 1e-06, + "loss": 0.8493, + "mean_token_accuracy": 0.7373051643371582, + "num_tokens": 228084778.0, + "step": 9005 + }, + { + "epoch": 0.9890182297386338, + "grad_norm": 2.3545291423797607, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.6954953670501709, + "num_tokens": 228105844.0, + "step": 9006 + }, + { + "epoch": 0.9891280474412475, + "grad_norm": 2.3697142601013184, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7008836269378662, + "num_tokens": 228129619.0, + "step": 9007 + }, + { + "epoch": 0.9892378651438611, + "grad_norm": 2.4117588996887207, + "learning_rate": 1e-06, + 
"loss": 0.9981, + "mean_token_accuracy": 0.6925771832466125, + "num_tokens": 228153902.0, + "step": 9008 + }, + { + "epoch": 0.9893476828464749, + "grad_norm": 2.166771173477173, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.6995934247970581, + "num_tokens": 228179267.0, + "step": 9009 + }, + { + "epoch": 0.9894575005490885, + "grad_norm": 2.569539785385132, + "learning_rate": 1e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7318888902664185, + "num_tokens": 228197319.0, + "step": 9010 + }, + { + "epoch": 0.9895673182517022, + "grad_norm": 2.183713674545288, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.7013061046600342, + "num_tokens": 228223680.0, + "step": 9011 + }, + { + "epoch": 0.9896771359543158, + "grad_norm": 2.2376673221588135, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7148334980010986, + "num_tokens": 228248528.0, + "step": 9012 + }, + { + "epoch": 0.9897869536569295, + "grad_norm": 2.1730096340179443, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7235032320022583, + "num_tokens": 228275054.0, + "step": 9013 + }, + { + "epoch": 0.9898967713595431, + "grad_norm": 2.214590549468994, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7044723033905029, + "num_tokens": 228298479.0, + "step": 9014 + }, + { + "epoch": 0.9900065890621568, + "grad_norm": 2.1592366695404053, + "learning_rate": 1e-06, + "loss": 0.7911, + "mean_token_accuracy": 0.7515995502471924, + "num_tokens": 228322189.0, + "step": 9015 + }, + { + "epoch": 0.9901164067647705, + "grad_norm": 2.2227609157562256, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7046971321105957, + "num_tokens": 228346505.0, + "step": 9016 + }, + { + "epoch": 0.9902262244673842, + "grad_norm": 1.908617377281189, + "learning_rate": 1e-06, + "loss": 1.0782, + "mean_token_accuracy": 0.6735224723815918, + "num_tokens": 228380732.0, + "step": 9017 + }, + { + "epoch": 
0.9903360421699978, + "grad_norm": 2.0897085666656494, + "learning_rate": 1e-06, + "loss": 0.884, + "mean_token_accuracy": 0.7244194746017456, + "num_tokens": 228408058.0, + "step": 9018 + }, + { + "epoch": 0.9904458598726115, + "grad_norm": 2.5307321548461914, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.713567852973938, + "num_tokens": 228427002.0, + "step": 9019 + }, + { + "epoch": 0.9905556775752251, + "grad_norm": 2.2077932357788086, + "learning_rate": 1e-06, + "loss": 1.0391, + "mean_token_accuracy": 0.6828426122665405, + "num_tokens": 228454094.0, + "step": 9020 + }, + { + "epoch": 0.9906654952778388, + "grad_norm": 2.2398436069488525, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7131708860397339, + "num_tokens": 228478965.0, + "step": 9021 + }, + { + "epoch": 0.9907753129804524, + "grad_norm": 2.1861965656280518, + "learning_rate": 1e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7242012023925781, + "num_tokens": 228504693.0, + "step": 9022 + }, + { + "epoch": 0.9908851306830662, + "grad_norm": 2.1217973232269287, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7185218334197998, + "num_tokens": 228530280.0, + "step": 9023 + }, + { + "epoch": 0.9909949483856798, + "grad_norm": 2.374394178390503, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.711018979549408, + "num_tokens": 228553575.0, + "step": 9024 + }, + { + "epoch": 0.9911047660882935, + "grad_norm": 1.956242561340332, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7076987028121948, + "num_tokens": 228582947.0, + "step": 9025 + }, + { + "epoch": 0.9912145837909071, + "grad_norm": 2.356086492538452, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7116230726242065, + "num_tokens": 228604223.0, + "step": 9026 + }, + { + "epoch": 0.9913244014935207, + "grad_norm": 2.624635696411133, + "learning_rate": 1e-06, + "loss": 0.8679, + "mean_token_accuracy": 
0.719914972782135, + "num_tokens": 228622770.0, + "step": 9027 + }, + { + "epoch": 0.9914342191961344, + "grad_norm": 2.180468797683716, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.7099113464355469, + "num_tokens": 228649464.0, + "step": 9028 + }, + { + "epoch": 0.991544036898748, + "grad_norm": 2.102229595184326, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.700461208820343, + "num_tokens": 228676745.0, + "step": 9029 + }, + { + "epoch": 0.9916538546013618, + "grad_norm": 2.0005388259887695, + "learning_rate": 1e-06, + "loss": 1.0353, + "mean_token_accuracy": 0.6913964748382568, + "num_tokens": 228708454.0, + "step": 9030 + }, + { + "epoch": 0.9917636723039754, + "grad_norm": 2.6477060317993164, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7194189429283142, + "num_tokens": 228728266.0, + "step": 9031 + }, + { + "epoch": 0.9918734900065891, + "grad_norm": 2.275336265563965, + "learning_rate": 1e-06, + "loss": 0.8523, + "mean_token_accuracy": 0.732440710067749, + "num_tokens": 228749735.0, + "step": 9032 + }, + { + "epoch": 0.9919833077092027, + "grad_norm": 2.0754363536834717, + "learning_rate": 1e-06, + "loss": 0.834, + "mean_token_accuracy": 0.7350759506225586, + "num_tokens": 228774776.0, + "step": 9033 + }, + { + "epoch": 0.9920931254118164, + "grad_norm": 2.0996954441070557, + "learning_rate": 1e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.7301638722419739, + "num_tokens": 228804053.0, + "step": 9034 + }, + { + "epoch": 0.99220294311443, + "grad_norm": 2.2472431659698486, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7142283916473389, + "num_tokens": 228828101.0, + "step": 9035 + }, + { + "epoch": 0.9923127608170437, + "grad_norm": 2.329219102859497, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.7092632055282593, + "num_tokens": 228850728.0, + "step": 9036 + }, + { + "epoch": 0.9924225785196573, + "grad_norm": 2.2808339595794678, 
+ "learning_rate": 1e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7237508296966553, + "num_tokens": 228874452.0, + "step": 9037 + }, + { + "epoch": 0.9925323962222711, + "grad_norm": 2.1964895725250244, + "learning_rate": 1e-06, + "loss": 0.8647, + "mean_token_accuracy": 0.7272082567214966, + "num_tokens": 228898423.0, + "step": 9038 + }, + { + "epoch": 0.9926422139248847, + "grad_norm": 2.2437140941619873, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7256062626838684, + "num_tokens": 228921370.0, + "step": 9039 + }, + { + "epoch": 0.9927520316274984, + "grad_norm": 2.052757978439331, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7161571383476257, + "num_tokens": 228948904.0, + "step": 9040 + }, + { + "epoch": 0.992861849330112, + "grad_norm": 2.1341137886047363, + "learning_rate": 1e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.7052221894264221, + "num_tokens": 228976615.0, + "step": 9041 + }, + { + "epoch": 0.9929716670327257, + "grad_norm": 2.400921106338501, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7060531377792358, + "num_tokens": 228998022.0, + "step": 9042 + }, + { + "epoch": 0.9930814847353393, + "grad_norm": 1.7882152795791626, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.691615879535675, + "num_tokens": 229034469.0, + "step": 9043 + }, + { + "epoch": 0.993191302437953, + "grad_norm": 2.4732940196990967, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.730269193649292, + "num_tokens": 229054696.0, + "step": 9044 + }, + { + "epoch": 0.9933011201405667, + "grad_norm": 2.021113157272339, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.6931280493736267, + "num_tokens": 229083074.0, + "step": 9045 + }, + { + "epoch": 0.9934109378431804, + "grad_norm": 2.011997699737549, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7267733812332153, + "num_tokens": 229109708.0, + "step": 9046 + }, 
+ { + "epoch": 0.993520755545794, + "grad_norm": 2.605530261993408, + "learning_rate": 1e-06, + "loss": 0.8188, + "mean_token_accuracy": 0.7389799952507019, + "num_tokens": 229129643.0, + "step": 9047 + }, + { + "epoch": 0.9936305732484076, + "grad_norm": 2.216566562652588, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.714994490146637, + "num_tokens": 229155480.0, + "step": 9048 + }, + { + "epoch": 0.9937403909510213, + "grad_norm": 2.2499160766601562, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.722687840461731, + "num_tokens": 229179905.0, + "step": 9049 + }, + { + "epoch": 0.9938502086536349, + "grad_norm": 2.0312740802764893, + "learning_rate": 1e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.7302148938179016, + "num_tokens": 229208755.0, + "step": 9050 + }, + { + "epoch": 0.9939600263562486, + "grad_norm": 2.1218106746673584, + "learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7045320272445679, + "num_tokens": 229233153.0, + "step": 9051 + }, + { + "epoch": 0.9940698440588623, + "grad_norm": 2.2234296798706055, + "learning_rate": 1e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7273567318916321, + "num_tokens": 229256994.0, + "step": 9052 + }, + { + "epoch": 0.994179661761476, + "grad_norm": 2.180943489074707, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.706000030040741, + "num_tokens": 229285793.0, + "step": 9053 + }, + { + "epoch": 0.9942894794640896, + "grad_norm": 2.3483643531799316, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.7042334675788879, + "num_tokens": 229307704.0, + "step": 9054 + }, + { + "epoch": 0.9943992971667033, + "grad_norm": 2.3070485591888428, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7174665927886963, + "num_tokens": 229330482.0, + "step": 9055 + }, + { + "epoch": 0.9945091148693169, + "grad_norm": 2.2833218574523926, + "learning_rate": 1e-06, + "loss": 0.9502, + 
"mean_token_accuracy": 0.7074492573738098, + "num_tokens": 229354327.0, + "step": 9056 + }, + { + "epoch": 0.9946189325719306, + "grad_norm": 2.4702608585357666, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7000893950462341, + "num_tokens": 229374482.0, + "step": 9057 + }, + { + "epoch": 0.9947287502745442, + "grad_norm": 2.3403160572052, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7003657817840576, + "num_tokens": 229400384.0, + "step": 9058 + }, + { + "epoch": 0.994838567977158, + "grad_norm": 2.2906177043914795, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7081718444824219, + "num_tokens": 229423685.0, + "step": 9059 + }, + { + "epoch": 0.9949483856797716, + "grad_norm": 2.247365951538086, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7089056968688965, + "num_tokens": 229448660.0, + "step": 9060 + }, + { + "epoch": 0.9950582033823853, + "grad_norm": 2.048863172531128, + "learning_rate": 1e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.7342281937599182, + "num_tokens": 229473486.0, + "step": 9061 + }, + { + "epoch": 0.9951680210849989, + "grad_norm": 1.8565173149108887, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7211626172065735, + "num_tokens": 229505524.0, + "step": 9062 + }, + { + "epoch": 0.9952778387876126, + "grad_norm": 2.559913396835327, + "learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.7042436003684998, + "num_tokens": 229524896.0, + "step": 9063 + }, + { + "epoch": 0.9953876564902262, + "grad_norm": 2.1244516372680664, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7116739749908447, + "num_tokens": 229550968.0, + "step": 9064 + }, + { + "epoch": 0.9954974741928398, + "grad_norm": 2.232618808746338, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7184910774230957, + "num_tokens": 229574115.0, + "step": 9065 + }, + { + "epoch": 0.9956072918954535, + 
"grad_norm": 2.1592628955841064, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.700173020362854, + "num_tokens": 229599913.0, + "step": 9066 + }, + { + "epoch": 0.9957171095980673, + "grad_norm": 2.447390079498291, + "learning_rate": 1e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.732723593711853, + "num_tokens": 229618998.0, + "step": 9067 + }, + { + "epoch": 0.9958269273006809, + "grad_norm": 2.0323712825775146, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.6925623416900635, + "num_tokens": 229650531.0, + "step": 9068 + }, + { + "epoch": 0.9959367450032945, + "grad_norm": 2.193634033203125, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7257827520370483, + "num_tokens": 229674318.0, + "step": 9069 + }, + { + "epoch": 0.9960465627059082, + "grad_norm": 2.032841444015503, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7160888910293579, + "num_tokens": 229702646.0, + "step": 9070 + }, + { + "epoch": 0.9961563804085218, + "grad_norm": 2.2397148609161377, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7055737972259521, + "num_tokens": 229727252.0, + "step": 9071 + }, + { + "epoch": 0.9962661981111355, + "grad_norm": 2.259160041809082, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.717362642288208, + "num_tokens": 229749957.0, + "step": 9072 + }, + { + "epoch": 0.9963760158137491, + "grad_norm": 2.3596999645233154, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7209625244140625, + "num_tokens": 229771723.0, + "step": 9073 + }, + { + "epoch": 0.9964858335163629, + "grad_norm": 2.310407876968384, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7028727531433105, + "num_tokens": 229796449.0, + "step": 9074 + }, + { + "epoch": 0.9965956512189765, + "grad_norm": 2.2866663932800293, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7154849767684937, + 
"num_tokens": 229822268.0, + "step": 9075 + }, + { + "epoch": 0.9967054689215902, + "grad_norm": 2.6836698055267334, + "learning_rate": 1e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7171022891998291, + "num_tokens": 229840924.0, + "step": 9076 + }, + { + "epoch": 0.9968152866242038, + "grad_norm": 2.3415510654449463, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7224639654159546, + "num_tokens": 229863011.0, + "step": 9077 + }, + { + "epoch": 0.9969251043268175, + "grad_norm": 2.16374135017395, + "learning_rate": 1e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7104436159133911, + "num_tokens": 229887293.0, + "step": 9078 + }, + { + "epoch": 0.9970349220294311, + "grad_norm": 2.1413931846618652, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.6989889144897461, + "num_tokens": 229915016.0, + "step": 9079 + }, + { + "epoch": 0.9971447397320448, + "grad_norm": 2.3188254833221436, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7014827132225037, + "num_tokens": 229938828.0, + "step": 9080 + }, + { + "epoch": 0.9972545574346585, + "grad_norm": 2.257969856262207, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7191659212112427, + "num_tokens": 229965604.0, + "step": 9081 + }, + { + "epoch": 0.9973643751372722, + "grad_norm": 2.107790231704712, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7231177687644958, + "num_tokens": 229991920.0, + "step": 9082 + }, + { + "epoch": 0.9974741928398858, + "grad_norm": 2.073012351989746, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7106850743293762, + "num_tokens": 230018235.0, + "step": 9083 + }, + { + "epoch": 0.9975840105424995, + "grad_norm": 2.2003841400146484, + "learning_rate": 1e-06, + "loss": 0.8442, + "mean_token_accuracy": 0.7275002002716064, + "num_tokens": 230041821.0, + "step": 9084 + }, + { + "epoch": 0.9976938282451131, + "grad_norm": 2.015000343322754, + 
"learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7048851847648621, + "num_tokens": 230069943.0, + "step": 9085 + }, + { + "epoch": 0.9978036459477267, + "grad_norm": 2.1279423236846924, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7016831636428833, + "num_tokens": 230095762.0, + "step": 9086 + }, + { + "epoch": 0.9979134636503404, + "grad_norm": 2.3852157592773438, + "learning_rate": 1e-06, + "loss": 0.7817, + "mean_token_accuracy": 0.7499554753303528, + "num_tokens": 230115086.0, + "step": 9087 + }, + { + "epoch": 0.9980232813529542, + "grad_norm": 2.5321238040924072, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.731611430644989, + "num_tokens": 230134121.0, + "step": 9088 + }, + { + "epoch": 0.9981330990555678, + "grad_norm": 2.1950950622558594, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7048250436782837, + "num_tokens": 230157952.0, + "step": 9089 + }, + { + "epoch": 0.9982429167581814, + "grad_norm": 2.293426990509033, + "learning_rate": 1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7239295840263367, + "num_tokens": 230179777.0, + "step": 9090 + }, + { + "epoch": 0.9983527344607951, + "grad_norm": 2.1881778240203857, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7067978978157043, + "num_tokens": 230204515.0, + "step": 9091 + }, + { + "epoch": 0.9984625521634087, + "grad_norm": 2.431931495666504, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7115397453308105, + "num_tokens": 230225382.0, + "step": 9092 + }, + { + "epoch": 0.9985723698660224, + "grad_norm": 2.4256136417388916, + "learning_rate": 1e-06, + "loss": 0.8459, + "mean_token_accuracy": 0.7334274053573608, + "num_tokens": 230247316.0, + "step": 9093 + }, + { + "epoch": 0.998682187568636, + "grad_norm": 2.1726253032684326, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6864417791366577, + "num_tokens": 230274244.0, + "step": 9094 + 
}, + { + "epoch": 0.9987920052712497, + "grad_norm": 2.524359703063965, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7071722149848938, + "num_tokens": 230295739.0, + "step": 9095 + }, + { + "epoch": 0.9989018229738634, + "grad_norm": 2.1806464195251465, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7071422338485718, + "num_tokens": 230318127.0, + "step": 9096 + }, + { + "epoch": 0.9990116406764771, + "grad_norm": 1.7952945232391357, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.6970728039741516, + "num_tokens": 230352626.0, + "step": 9097 + }, + { + "epoch": 0.9991214583790907, + "grad_norm": 2.1976687908172607, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.6949440240859985, + "num_tokens": 230378945.0, + "step": 9098 + }, + { + "epoch": 0.9992312760817044, + "grad_norm": 2.155771255493164, + "learning_rate": 1e-06, + "loss": 1.0439, + "mean_token_accuracy": 0.6884879469871521, + "num_tokens": 230405385.0, + "step": 9099 + }, + { + "epoch": 0.999341093784318, + "grad_norm": 2.2427639961242676, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7199592590332031, + "num_tokens": 230431320.0, + "step": 9100 + }, + { + "epoch": 0.9994509114869317, + "grad_norm": 2.0692405700683594, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6821461915969849, + "num_tokens": 230460894.0, + "step": 9101 + }, + { + "epoch": 0.9995607291895453, + "grad_norm": 2.394057512283325, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7029955387115479, + "num_tokens": 230482619.0, + "step": 9102 + }, + { + "epoch": 0.9996705468921591, + "grad_norm": 2.1193604469299316, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7124769687652588, + "num_tokens": 230509353.0, + "step": 9103 + }, + { + "epoch": 0.9997803645947727, + "grad_norm": 1.933107614517212, + "learning_rate": 1e-06, + "loss": 0.9185, + 
"mean_token_accuracy": 0.7079575061798096, + "num_tokens": 230537640.0, + "step": 9104 + }, + { + "epoch": 0.9998901822973864, + "grad_norm": 2.2447257041931152, + "learning_rate": 1e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.7278924584388733, + "num_tokens": 230560650.0, + "step": 9105 + }, + { + "epoch": 1.0, + "grad_norm": 2.1138598918914795, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7065436244010925, + "num_tokens": 230586928.0, + "step": 9106 + }, + { + "epoch": 1.0001098177026138, + "grad_norm": 2.18992280960083, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7060664892196655, + "num_tokens": 230610721.0, + "step": 9107 + }, + { + "epoch": 1.0002196354052273, + "grad_norm": 2.2641453742980957, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7015529870986938, + "num_tokens": 230633578.0, + "step": 9108 + }, + { + "epoch": 1.000329453107841, + "grad_norm": 2.16428804397583, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7275726199150085, + "num_tokens": 230659540.0, + "step": 9109 + }, + { + "epoch": 1.0004392708104546, + "grad_norm": 2.1561577320098877, + "learning_rate": 1e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7324700355529785, + "num_tokens": 230683654.0, + "step": 9110 + }, + { + "epoch": 1.0005490885130683, + "grad_norm": 2.4460530281066895, + "learning_rate": 1e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7198779582977295, + "num_tokens": 230703306.0, + "step": 9111 + }, + { + "epoch": 1.0006589062156819, + "grad_norm": 2.1352298259735107, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7133848667144775, + "num_tokens": 230728955.0, + "step": 9112 + }, + { + "epoch": 1.0007687239182956, + "grad_norm": 2.189063549041748, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7236534357070923, + "num_tokens": 230753783.0, + "step": 9113 + }, + { + "epoch": 1.0008785416209094, + "grad_norm": 
1.9972703456878662, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7135746479034424, + "num_tokens": 230782226.0, + "step": 9114 + }, + { + "epoch": 1.000988359323523, + "grad_norm": 2.1000733375549316, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7258222103118896, + "num_tokens": 230808518.0, + "step": 9115 + }, + { + "epoch": 1.0010981770261367, + "grad_norm": 1.7118399143218994, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7089476585388184, + "num_tokens": 230849339.0, + "step": 9116 + }, + { + "epoch": 1.0012079947287502, + "grad_norm": 2.186929225921631, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.6987098455429077, + "num_tokens": 230876437.0, + "step": 9117 + }, + { + "epoch": 1.001317812431364, + "grad_norm": 2.1434035301208496, + "learning_rate": 1e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.7318307757377625, + "num_tokens": 230901902.0, + "step": 9118 + }, + { + "epoch": 1.0014276301339775, + "grad_norm": 2.0438528060913086, + "learning_rate": 1e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7218761444091797, + "num_tokens": 230932050.0, + "step": 9119 + }, + { + "epoch": 1.0015374478365913, + "grad_norm": 2.2815515995025635, + "learning_rate": 1e-06, + "loss": 0.8696, + "mean_token_accuracy": 0.7294062376022339, + "num_tokens": 230955888.0, + "step": 9120 + }, + { + "epoch": 1.001647265539205, + "grad_norm": 2.069253444671631, + "learning_rate": 1e-06, + "loss": 0.8456, + "mean_token_accuracy": 0.7355877161026001, + "num_tokens": 230984697.0, + "step": 9121 + }, + { + "epoch": 1.0017570832418186, + "grad_norm": 2.13482928276062, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7237892150878906, + "num_tokens": 231010870.0, + "step": 9122 + }, + { + "epoch": 1.0018669009444323, + "grad_norm": 2.3966269493103027, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7047990560531616, + "num_tokens": 
231034791.0, + "step": 9123 + }, + { + "epoch": 1.0019767186470458, + "grad_norm": 2.0769753456115723, + "learning_rate": 1e-06, + "loss": 0.8647, + "mean_token_accuracy": 0.7275340557098389, + "num_tokens": 231061960.0, + "step": 9124 + }, + { + "epoch": 1.0020865363496596, + "grad_norm": 1.948074460029602, + "learning_rate": 1e-06, + "loss": 1.0843, + "mean_token_accuracy": 0.6703146696090698, + "num_tokens": 231093466.0, + "step": 9125 + }, + { + "epoch": 1.0021963540522731, + "grad_norm": 2.483457326889038, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7299695611000061, + "num_tokens": 231114210.0, + "step": 9126 + }, + { + "epoch": 1.002306171754887, + "grad_norm": 2.2654480934143066, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7188178300857544, + "num_tokens": 231137997.0, + "step": 9127 + }, + { + "epoch": 1.0024159894575007, + "grad_norm": 2.302189826965332, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.6916338205337524, + "num_tokens": 231161840.0, + "step": 9128 + }, + { + "epoch": 1.0025258071601142, + "grad_norm": 2.411327600479126, + "learning_rate": 1e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.7371650338172913, + "num_tokens": 231183718.0, + "step": 9129 + }, + { + "epoch": 1.002635624862728, + "grad_norm": 2.225170135498047, + "learning_rate": 1e-06, + "loss": 0.8367, + "mean_token_accuracy": 0.7426849007606506, + "num_tokens": 231207221.0, + "step": 9130 + }, + { + "epoch": 1.0027454425653415, + "grad_norm": 2.099194049835205, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7121748924255371, + "num_tokens": 231233872.0, + "step": 9131 + }, + { + "epoch": 1.0028552602679552, + "grad_norm": 2.0724380016326904, + "learning_rate": 1e-06, + "loss": 1.0498, + "mean_token_accuracy": 0.6776959896087646, + "num_tokens": 231264334.0, + "step": 9132 + }, + { + "epoch": 1.0029650779705688, + "grad_norm": 2.215512990951538, + "learning_rate": 1e-06, + 
"loss": 0.8808, + "mean_token_accuracy": 0.7307931780815125, + "num_tokens": 231289293.0, + "step": 9133 + }, + { + "epoch": 1.0030748956731825, + "grad_norm": 2.1129894256591797, + "learning_rate": 1e-06, + "loss": 0.8505, + "mean_token_accuracy": 0.7294940948486328, + "num_tokens": 231314144.0, + "step": 9134 + }, + { + "epoch": 1.0031847133757963, + "grad_norm": 2.245249032974243, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7171655893325806, + "num_tokens": 231340444.0, + "step": 9135 + }, + { + "epoch": 1.0032945310784098, + "grad_norm": 2.1515181064605713, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7070374488830566, + "num_tokens": 231366926.0, + "step": 9136 + }, + { + "epoch": 1.0034043487810236, + "grad_norm": 2.2386467456817627, + "learning_rate": 1e-06, + "loss": 0.8457, + "mean_token_accuracy": 0.7374178767204285, + "num_tokens": 231390113.0, + "step": 9137 + }, + { + "epoch": 1.0035141664836371, + "grad_norm": 2.2715234756469727, + "learning_rate": 1e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7310655117034912, + "num_tokens": 231412798.0, + "step": 9138 + }, + { + "epoch": 1.0036239841862509, + "grad_norm": 2.174257278442383, + "learning_rate": 1e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7131267189979553, + "num_tokens": 231438973.0, + "step": 9139 + }, + { + "epoch": 1.0037338018888644, + "grad_norm": 2.5120420455932617, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7203637361526489, + "num_tokens": 231461571.0, + "step": 9140 + }, + { + "epoch": 1.0038436195914782, + "grad_norm": 2.2225303649902344, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7081679105758667, + "num_tokens": 231486794.0, + "step": 9141 + }, + { + "epoch": 1.0039534372940917, + "grad_norm": 2.026832103729248, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.6903403997421265, + "num_tokens": 231519281.0, + "step": 9142 + }, + { + "epoch": 
1.0040632549967055, + "grad_norm": 2.0001060962677, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7098474502563477, + "num_tokens": 231549264.0, + "step": 9143 + }, + { + "epoch": 1.0041730726993192, + "grad_norm": 2.7753407955169678, + "learning_rate": 1e-06, + "loss": 0.8296, + "mean_token_accuracy": 0.7383077144622803, + "num_tokens": 231568029.0, + "step": 9144 + }, + { + "epoch": 1.0042828904019327, + "grad_norm": 1.992506504058838, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.719489574432373, + "num_tokens": 231598032.0, + "step": 9145 + }, + { + "epoch": 1.0043927081045465, + "grad_norm": 2.1798107624053955, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7133811712265015, + "num_tokens": 231624601.0, + "step": 9146 + }, + { + "epoch": 1.00450252580716, + "grad_norm": 1.8133211135864258, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.696852445602417, + "num_tokens": 231660499.0, + "step": 9147 + }, + { + "epoch": 1.0046123435097738, + "grad_norm": 2.2240707874298096, + "learning_rate": 1e-06, + "loss": 0.8457, + "mean_token_accuracy": 0.7271060943603516, + "num_tokens": 231686015.0, + "step": 9148 + }, + { + "epoch": 1.0047221612123873, + "grad_norm": 2.1976356506347656, + "learning_rate": 1e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7182106375694275, + "num_tokens": 231709900.0, + "step": 9149 + }, + { + "epoch": 1.004831978915001, + "grad_norm": 2.7967541217803955, + "learning_rate": 1e-06, + "loss": 0.8677, + "mean_token_accuracy": 0.7320587635040283, + "num_tokens": 231726624.0, + "step": 9150 + }, + { + "epoch": 1.0049417966176148, + "grad_norm": 2.387408971786499, + "learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.7053807973861694, + "num_tokens": 231752729.0, + "step": 9151 + }, + { + "epoch": 1.0050516143202284, + "grad_norm": 2.052156925201416, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 
0.7021949291229248, + "num_tokens": 231782869.0, + "step": 9152 + }, + { + "epoch": 1.0051614320228421, + "grad_norm": 2.3902950286865234, + "learning_rate": 1e-06, + "loss": 0.8474, + "mean_token_accuracy": 0.7286159992218018, + "num_tokens": 231804225.0, + "step": 9153 + }, + { + "epoch": 1.0052712497254557, + "grad_norm": 2.043351173400879, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7089707851409912, + "num_tokens": 231831287.0, + "step": 9154 + }, + { + "epoch": 1.0053810674280694, + "grad_norm": 2.251732110977173, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7228296995162964, + "num_tokens": 231858073.0, + "step": 9155 + }, + { + "epoch": 1.005490885130683, + "grad_norm": 2.023087501525879, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7263889312744141, + "num_tokens": 231887751.0, + "step": 9156 + }, + { + "epoch": 1.0056007028332967, + "grad_norm": 2.2154476642608643, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7325695753097534, + "num_tokens": 231913979.0, + "step": 9157 + }, + { + "epoch": 1.0057105205359105, + "grad_norm": 2.515256881713867, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7017938494682312, + "num_tokens": 231935021.0, + "step": 9158 + }, + { + "epoch": 1.005820338238524, + "grad_norm": 2.071309804916382, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.6992526054382324, + "num_tokens": 231966105.0, + "step": 9159 + }, + { + "epoch": 1.0059301559411378, + "grad_norm": 2.4776957035064697, + "learning_rate": 1e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.729162335395813, + "num_tokens": 231986413.0, + "step": 9160 + }, + { + "epoch": 1.0060399736437513, + "grad_norm": 2.1095049381256104, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7207576632499695, + "num_tokens": 232015101.0, + "step": 9161 + }, + { + "epoch": 1.006149791346365, + "grad_norm": 
2.695951461791992, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7158174514770508, + "num_tokens": 232033400.0, + "step": 9162 + }, + { + "epoch": 1.0062596090489786, + "grad_norm": 2.6606454849243164, + "learning_rate": 1e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.7189018130302429, + "num_tokens": 232053482.0, + "step": 9163 + }, + { + "epoch": 1.0063694267515924, + "grad_norm": 2.311086416244507, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7116792798042297, + "num_tokens": 232076129.0, + "step": 9164 + }, + { + "epoch": 1.0064792444542061, + "grad_norm": 2.3759279251098633, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7092486619949341, + "num_tokens": 232100334.0, + "step": 9165 + }, + { + "epoch": 1.0065890621568196, + "grad_norm": 2.240084409713745, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7175561189651489, + "num_tokens": 232126285.0, + "step": 9166 + }, + { + "epoch": 1.0066988798594334, + "grad_norm": 2.5067508220672607, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7295677065849304, + "num_tokens": 232146467.0, + "step": 9167 + }, + { + "epoch": 1.006808697562047, + "grad_norm": 2.331061840057373, + "learning_rate": 1e-06, + "loss": 0.8434, + "mean_token_accuracy": 0.730445384979248, + "num_tokens": 232168866.0, + "step": 9168 + }, + { + "epoch": 1.0069185152646607, + "grad_norm": 2.089266300201416, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7182044386863708, + "num_tokens": 232196869.0, + "step": 9169 + }, + { + "epoch": 1.0070283329672742, + "grad_norm": 2.3557937145233154, + "learning_rate": 1e-06, + "loss": 0.837, + "mean_token_accuracy": 0.7338396906852722, + "num_tokens": 232217481.0, + "step": 9170 + }, + { + "epoch": 1.007138150669888, + "grad_norm": 2.2517213821411133, + "learning_rate": 1e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7244632244110107, + "num_tokens": 
232243100.0, + "step": 9171 + }, + { + "epoch": 1.0072479683725017, + "grad_norm": 2.3232414722442627, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7135975956916809, + "num_tokens": 232266149.0, + "step": 9172 + }, + { + "epoch": 1.0073577860751153, + "grad_norm": 1.9594216346740723, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7061678171157837, + "num_tokens": 232299718.0, + "step": 9173 + }, + { + "epoch": 1.007467603777729, + "grad_norm": 2.1957945823669434, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7310309410095215, + "num_tokens": 232326785.0, + "step": 9174 + }, + { + "epoch": 1.0075774214803426, + "grad_norm": 2.329197883605957, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.715506911277771, + "num_tokens": 232349638.0, + "step": 9175 + }, + { + "epoch": 1.0076872391829563, + "grad_norm": 2.2037060260772705, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7219036817550659, + "num_tokens": 232372748.0, + "step": 9176 + }, + { + "epoch": 1.0077970568855699, + "grad_norm": 2.4654383659362793, + "learning_rate": 1e-06, + "loss": 0.8306, + "mean_token_accuracy": 0.7416191101074219, + "num_tokens": 232394265.0, + "step": 9177 + }, + { + "epoch": 1.0079068745881836, + "grad_norm": 2.2651515007019043, + "learning_rate": 1e-06, + "loss": 0.8727, + "mean_token_accuracy": 0.7252768874168396, + "num_tokens": 232417248.0, + "step": 9178 + }, + { + "epoch": 1.0080166922907974, + "grad_norm": 1.968997836112976, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.7039803266525269, + "num_tokens": 232448549.0, + "step": 9179 + }, + { + "epoch": 1.008126509993411, + "grad_norm": 2.2002124786376953, + "learning_rate": 1e-06, + "loss": 0.8302, + "mean_token_accuracy": 0.7365279793739319, + "num_tokens": 232474234.0, + "step": 9180 + }, + { + "epoch": 1.0082363276960247, + "grad_norm": 2.1667134761810303, + "learning_rate": 1e-06, + 
"loss": 0.8301, + "mean_token_accuracy": 0.7373278141021729, + "num_tokens": 232499542.0, + "step": 9181 + }, + { + "epoch": 1.0083461453986382, + "grad_norm": 2.1122589111328125, + "learning_rate": 1e-06, + "loss": 0.8498, + "mean_token_accuracy": 0.7364211678504944, + "num_tokens": 232524330.0, + "step": 9182 + }, + { + "epoch": 1.008455963101252, + "grad_norm": 2.4725735187530518, + "learning_rate": 1e-06, + "loss": 0.8213, + "mean_token_accuracy": 0.7429288625717163, + "num_tokens": 232545687.0, + "step": 9183 + }, + { + "epoch": 1.0085657808038655, + "grad_norm": 2.522890567779541, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7117575407028198, + "num_tokens": 232566355.0, + "step": 9184 + }, + { + "epoch": 1.0086755985064793, + "grad_norm": 2.2553536891937256, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7069618105888367, + "num_tokens": 232592797.0, + "step": 9185 + }, + { + "epoch": 1.008785416209093, + "grad_norm": 2.121432065963745, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7117263674736023, + "num_tokens": 232621372.0, + "step": 9186 + }, + { + "epoch": 1.0088952339117065, + "grad_norm": 2.252305746078491, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.6983342170715332, + "num_tokens": 232648900.0, + "step": 9187 + }, + { + "epoch": 1.0090050516143203, + "grad_norm": 2.312453031539917, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.6963136792182922, + "num_tokens": 232675184.0, + "step": 9188 + }, + { + "epoch": 1.0091148693169338, + "grad_norm": 2.202746629714966, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7216843962669373, + "num_tokens": 232700110.0, + "step": 9189 + }, + { + "epoch": 1.0092246870195476, + "grad_norm": 2.7365660667419434, + "learning_rate": 1e-06, + "loss": 0.7447, + "mean_token_accuracy": 0.7571361064910889, + "num_tokens": 232716601.0, + "step": 9190 + }, + { + "epoch": 
1.0093345047221611, + "grad_norm": 2.2149300575256348, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7218155264854431, + "num_tokens": 232742816.0, + "step": 9191 + }, + { + "epoch": 1.0094443224247749, + "grad_norm": 2.265704870223999, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7116422653198242, + "num_tokens": 232767379.0, + "step": 9192 + }, + { + "epoch": 1.0095541401273886, + "grad_norm": 2.5659291744232178, + "learning_rate": 1e-06, + "loss": 0.7942, + "mean_token_accuracy": 0.7397628426551819, + "num_tokens": 232785246.0, + "step": 9193 + }, + { + "epoch": 1.0096639578300022, + "grad_norm": 2.401082754135132, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7159824371337891, + "num_tokens": 232806708.0, + "step": 9194 + }, + { + "epoch": 1.009773775532616, + "grad_norm": 1.9357244968414307, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.712748646736145, + "num_tokens": 232839253.0, + "step": 9195 + }, + { + "epoch": 1.0098835932352295, + "grad_norm": 2.4660680294036865, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.729015052318573, + "num_tokens": 232860758.0, + "step": 9196 + }, + { + "epoch": 1.0099934109378432, + "grad_norm": 2.3466150760650635, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7111724615097046, + "num_tokens": 232883368.0, + "step": 9197 + }, + { + "epoch": 1.0101032286404568, + "grad_norm": 2.1845405101776123, + "learning_rate": 1e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.719244122505188, + "num_tokens": 232910643.0, + "step": 9198 + }, + { + "epoch": 1.0102130463430705, + "grad_norm": 2.0565881729125977, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.6988025903701782, + "num_tokens": 232939878.0, + "step": 9199 + }, + { + "epoch": 1.010322864045684, + "grad_norm": 2.1883351802825928, + "learning_rate": 1e-06, + "loss": 0.9064, + "mean_token_accuracy": 
0.7156493663787842, + "num_tokens": 232966469.0, + "step": 9200 + }, + { + "epoch": 1.0104326817482978, + "grad_norm": 2.0927646160125732, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.6902408599853516, + "num_tokens": 232993854.0, + "step": 9201 + }, + { + "epoch": 1.0105424994509116, + "grad_norm": 2.180854082107544, + "learning_rate": 1e-06, + "loss": 0.8725, + "mean_token_accuracy": 0.7226201891899109, + "num_tokens": 233019963.0, + "step": 9202 + }, + { + "epoch": 1.010652317153525, + "grad_norm": 1.989770770072937, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7163026928901672, + "num_tokens": 233052838.0, + "step": 9203 + }, + { + "epoch": 1.0107621348561389, + "grad_norm": 2.192120313644409, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7124634981155396, + "num_tokens": 233079810.0, + "step": 9204 + }, + { + "epoch": 1.0108719525587524, + "grad_norm": 2.0658926963806152, + "learning_rate": 1e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.7418326735496521, + "num_tokens": 233106796.0, + "step": 9205 + }, + { + "epoch": 1.0109817702613662, + "grad_norm": 2.244852304458618, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.726318895816803, + "num_tokens": 233132615.0, + "step": 9206 + }, + { + "epoch": 1.0110915879639797, + "grad_norm": 2.2033512592315674, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.714970588684082, + "num_tokens": 233160159.0, + "step": 9207 + }, + { + "epoch": 1.0112014056665934, + "grad_norm": 2.362103223800659, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.6977323293685913, + "num_tokens": 233184083.0, + "step": 9208 + }, + { + "epoch": 1.0113112233692072, + "grad_norm": 1.9383246898651123, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7213760018348694, + "num_tokens": 233217743.0, + "step": 9209 + }, + { + "epoch": 1.0114210410718207, + "grad_norm": 
2.064626693725586, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.6903508305549622, + "num_tokens": 233253979.0, + "step": 9210 + }, + { + "epoch": 1.0115308587744345, + "grad_norm": 2.325080156326294, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7365087866783142, + "num_tokens": 233278476.0, + "step": 9211 + }, + { + "epoch": 1.011640676477048, + "grad_norm": 2.3028855323791504, + "learning_rate": 1e-06, + "loss": 0.8286, + "mean_token_accuracy": 0.7329766750335693, + "num_tokens": 233300700.0, + "step": 9212 + }, + { + "epoch": 1.0117504941796618, + "grad_norm": 2.410797595977783, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7356305718421936, + "num_tokens": 233322949.0, + "step": 9213 + }, + { + "epoch": 1.0118603118822753, + "grad_norm": 1.9920568466186523, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7193305492401123, + "num_tokens": 233352410.0, + "step": 9214 + }, + { + "epoch": 1.011970129584889, + "grad_norm": 2.286585807800293, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7139530181884766, + "num_tokens": 233377041.0, + "step": 9215 + }, + { + "epoch": 1.0120799472875028, + "grad_norm": 2.253920078277588, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7210923433303833, + "num_tokens": 233402363.0, + "step": 9216 + }, + { + "epoch": 1.0121897649901164, + "grad_norm": 2.4056620597839355, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.714128851890564, + "num_tokens": 233425322.0, + "step": 9217 + }, + { + "epoch": 1.0122995826927301, + "grad_norm": 2.5292303562164307, + "learning_rate": 1e-06, + "loss": 0.8311, + "mean_token_accuracy": 0.7274148464202881, + "num_tokens": 233444476.0, + "step": 9218 + }, + { + "epoch": 1.0124094003953437, + "grad_norm": 2.0749735832214355, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7181411981582642, + "num_tokens": 233474723.0, 
+ "step": 9219 + }, + { + "epoch": 1.0125192180979574, + "grad_norm": 2.2837929725646973, + "learning_rate": 1e-06, + "loss": 0.7275, + "mean_token_accuracy": 0.76996910572052, + "num_tokens": 233497108.0, + "step": 9220 + }, + { + "epoch": 1.012629035800571, + "grad_norm": 2.7378618717193604, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7162054181098938, + "num_tokens": 233516572.0, + "step": 9221 + }, + { + "epoch": 1.0127388535031847, + "grad_norm": 2.7041244506835938, + "learning_rate": 1e-06, + "loss": 0.8049, + "mean_token_accuracy": 0.7447421550750732, + "num_tokens": 233535192.0, + "step": 9222 + }, + { + "epoch": 1.0128486712057985, + "grad_norm": 2.54398250579834, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.7058351039886475, + "num_tokens": 233557862.0, + "step": 9223 + }, + { + "epoch": 1.012958488908412, + "grad_norm": 2.046647071838379, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7020251154899597, + "num_tokens": 233589977.0, + "step": 9224 + }, + { + "epoch": 1.0130683066110258, + "grad_norm": 2.798259973526001, + "learning_rate": 1e-06, + "loss": 0.8387, + "mean_token_accuracy": 0.7375000715255737, + "num_tokens": 233612051.0, + "step": 9225 + }, + { + "epoch": 1.0131781243136393, + "grad_norm": 2.1422500610351562, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.721176266670227, + "num_tokens": 233640882.0, + "step": 9226 + }, + { + "epoch": 1.013287942016253, + "grad_norm": 2.4764833450317383, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7303517460823059, + "num_tokens": 233661681.0, + "step": 9227 + }, + { + "epoch": 1.0133977597188666, + "grad_norm": 2.555556297302246, + "learning_rate": 1e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.7243739366531372, + "num_tokens": 233682590.0, + "step": 9228 + }, + { + "epoch": 1.0135075774214803, + "grad_norm": 2.28155517578125, + "learning_rate": 1e-06, + "loss": 0.8138, + 
"mean_token_accuracy": 0.7431272268295288, + "num_tokens": 233705688.0, + "step": 9229 + }, + { + "epoch": 1.013617395124094, + "grad_norm": 2.0995242595672607, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.720763087272644, + "num_tokens": 233733549.0, + "step": 9230 + }, + { + "epoch": 1.0137272128267076, + "grad_norm": 2.3517627716064453, + "learning_rate": 1e-06, + "loss": 0.8323, + "mean_token_accuracy": 0.7358123064041138, + "num_tokens": 233755712.0, + "step": 9231 + }, + { + "epoch": 1.0138370305293214, + "grad_norm": 2.7244200706481934, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7295525074005127, + "num_tokens": 233775928.0, + "step": 9232 + }, + { + "epoch": 1.013946848231935, + "grad_norm": 2.068695068359375, + "learning_rate": 1e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7168588042259216, + "num_tokens": 233805023.0, + "step": 9233 + }, + { + "epoch": 1.0140566659345487, + "grad_norm": 2.156407594680786, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.7048015594482422, + "num_tokens": 233833588.0, + "step": 9234 + }, + { + "epoch": 1.0141664836371622, + "grad_norm": 2.5772154331207275, + "learning_rate": 1e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.7329022884368896, + "num_tokens": 233854389.0, + "step": 9235 + }, + { + "epoch": 1.014276301339776, + "grad_norm": 2.4079346656799316, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7080218195915222, + "num_tokens": 233875787.0, + "step": 9236 + }, + { + "epoch": 1.0143861190423897, + "grad_norm": 2.2957494258880615, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7055356502532959, + "num_tokens": 233903556.0, + "step": 9237 + }, + { + "epoch": 1.0144959367450033, + "grad_norm": 2.5372445583343506, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7151373624801636, + "num_tokens": 233924380.0, + "step": 9238 + }, + { + "epoch": 1.014605754447617, + 
"grad_norm": 2.3381478786468506, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.737480640411377, + "num_tokens": 233947423.0, + "step": 9239 + }, + { + "epoch": 1.0147155721502306, + "grad_norm": 2.0492372512817383, + "learning_rate": 1e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7199370861053467, + "num_tokens": 233977340.0, + "step": 9240 + }, + { + "epoch": 1.0148253898528443, + "grad_norm": 2.178300380706787, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7138224840164185, + "num_tokens": 234003526.0, + "step": 9241 + }, + { + "epoch": 1.0149352075554579, + "grad_norm": 2.466430425643921, + "learning_rate": 1e-06, + "loss": 0.8563, + "mean_token_accuracy": 0.7310478687286377, + "num_tokens": 234025099.0, + "step": 9242 + }, + { + "epoch": 1.0150450252580716, + "grad_norm": 2.280456066131592, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7158195972442627, + "num_tokens": 234049137.0, + "step": 9243 + }, + { + "epoch": 1.0151548429606854, + "grad_norm": 2.4265382289886475, + "learning_rate": 1e-06, + "loss": 0.8342, + "mean_token_accuracy": 0.7346053123474121, + "num_tokens": 234070593.0, + "step": 9244 + }, + { + "epoch": 1.015264660663299, + "grad_norm": 1.9592801332473755, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.6959289312362671, + "num_tokens": 234100772.0, + "step": 9245 + }, + { + "epoch": 1.0153744783659127, + "grad_norm": 2.033013343811035, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7176132202148438, + "num_tokens": 234132160.0, + "step": 9246 + }, + { + "epoch": 1.0154842960685262, + "grad_norm": 2.2349917888641357, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7231072187423706, + "num_tokens": 234158469.0, + "step": 9247 + }, + { + "epoch": 1.01559411377114, + "grad_norm": 2.058119773864746, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.711134672164917, + "num_tokens": 
234186130.0, + "step": 9248 + }, + { + "epoch": 1.0157039314737535, + "grad_norm": 2.1163158416748047, + "learning_rate": 1e-06, + "loss": 0.8397, + "mean_token_accuracy": 0.7375041842460632, + "num_tokens": 234211865.0, + "step": 9249 + }, + { + "epoch": 1.0158137491763672, + "grad_norm": 2.295553684234619, + "learning_rate": 1e-06, + "loss": 0.8656, + "mean_token_accuracy": 0.7425296306610107, + "num_tokens": 234233681.0, + "step": 9250 + }, + { + "epoch": 1.015923566878981, + "grad_norm": 2.5263752937316895, + "learning_rate": 1e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7104510068893433, + "num_tokens": 234255180.0, + "step": 9251 + }, + { + "epoch": 1.0160333845815945, + "grad_norm": 2.339250087738037, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7109079360961914, + "num_tokens": 234279011.0, + "step": 9252 + }, + { + "epoch": 1.0161432022842083, + "grad_norm": 2.2942137718200684, + "learning_rate": 1e-06, + "loss": 0.7981, + "mean_token_accuracy": 0.7464232444763184, + "num_tokens": 234301744.0, + "step": 9253 + }, + { + "epoch": 1.0162530199868218, + "grad_norm": 2.2355008125305176, + "learning_rate": 1e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7343730926513672, + "num_tokens": 234326326.0, + "step": 9254 + }, + { + "epoch": 1.0163628376894356, + "grad_norm": 2.4287188053131104, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7117770314216614, + "num_tokens": 234347846.0, + "step": 9255 + }, + { + "epoch": 1.0164726553920491, + "grad_norm": 2.338693380355835, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7103970050811768, + "num_tokens": 234371100.0, + "step": 9256 + }, + { + "epoch": 1.0165824730946629, + "grad_norm": 2.423964738845825, + "learning_rate": 1e-06, + "loss": 0.8265, + "mean_token_accuracy": 0.7369427680969238, + "num_tokens": 234391333.0, + "step": 9257 + }, + { + "epoch": 1.0166922907972764, + "grad_norm": 2.294949769973755, + "learning_rate": 1e-06, + 
"loss": 0.9482, + "mean_token_accuracy": 0.7032088041305542, + "num_tokens": 234414349.0, + "step": 9258 + }, + { + "epoch": 1.0168021084998902, + "grad_norm": 2.427471876144409, + "learning_rate": 1e-06, + "loss": 0.8677, + "mean_token_accuracy": 0.7314205765724182, + "num_tokens": 234437308.0, + "step": 9259 + }, + { + "epoch": 1.016911926202504, + "grad_norm": 1.9522827863693237, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.695052981376648, + "num_tokens": 234467301.0, + "step": 9260 + }, + { + "epoch": 1.0170217439051175, + "grad_norm": 2.3077969551086426, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7084109783172607, + "num_tokens": 234491421.0, + "step": 9261 + }, + { + "epoch": 1.0171315616077312, + "grad_norm": 2.153146982192993, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7040504217147827, + "num_tokens": 234519430.0, + "step": 9262 + }, + { + "epoch": 1.0172413793103448, + "grad_norm": 2.3587944507598877, + "learning_rate": 1e-06, + "loss": 0.83, + "mean_token_accuracy": 0.7353265285491943, + "num_tokens": 234539969.0, + "step": 9263 + }, + { + "epoch": 1.0173511970129585, + "grad_norm": 2.4650321006774902, + "learning_rate": 1e-06, + "loss": 0.7702, + "mean_token_accuracy": 0.751929759979248, + "num_tokens": 234560097.0, + "step": 9264 + }, + { + "epoch": 1.017461014715572, + "grad_norm": 2.350085496902466, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7206994295120239, + "num_tokens": 234583250.0, + "step": 9265 + }, + { + "epoch": 1.0175708324181858, + "grad_norm": 2.195383071899414, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7257039546966553, + "num_tokens": 234610050.0, + "step": 9266 + }, + { + "epoch": 1.0176806501207996, + "grad_norm": 2.1084203720092773, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7285493016242981, + "num_tokens": 234638240.0, + "step": 9267 + }, + { + "epoch": 
1.017790467823413, + "grad_norm": 2.33166766166687, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.6962537169456482, + "num_tokens": 234663575.0, + "step": 9268 + }, + { + "epoch": 1.0179002855260268, + "grad_norm": 2.640678644180298, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7240169048309326, + "num_tokens": 234681322.0, + "step": 9269 + }, + { + "epoch": 1.0180101032286404, + "grad_norm": 2.3291642665863037, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7087189555168152, + "num_tokens": 234705568.0, + "step": 9270 + }, + { + "epoch": 1.0181199209312541, + "grad_norm": 2.1745522022247314, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7139703631401062, + "num_tokens": 234732409.0, + "step": 9271 + }, + { + "epoch": 1.0182297386338677, + "grad_norm": 1.9292298555374146, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7175908088684082, + "num_tokens": 234763946.0, + "step": 9272 + }, + { + "epoch": 1.0183395563364814, + "grad_norm": 1.977244257926941, + "learning_rate": 1e-06, + "loss": 0.7872, + "mean_token_accuracy": 0.7527860403060913, + "num_tokens": 234792255.0, + "step": 9273 + }, + { + "epoch": 1.0184493740390952, + "grad_norm": 2.498032569885254, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.725409984588623, + "num_tokens": 234813122.0, + "step": 9274 + }, + { + "epoch": 1.0185591917417087, + "grad_norm": 2.1501057147979736, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.690628170967102, + "num_tokens": 234841657.0, + "step": 9275 + }, + { + "epoch": 1.0186690094443225, + "grad_norm": 2.345252513885498, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7095335721969604, + "num_tokens": 234866367.0, + "step": 9276 + }, + { + "epoch": 1.018778827146936, + "grad_norm": 2.720015287399292, + "learning_rate": 1e-06, + "loss": 0.8572, + "mean_token_accuracy": 
0.7394316792488098, + "num_tokens": 234883901.0, + "step": 9277 + }, + { + "epoch": 1.0188886448495498, + "grad_norm": 2.1864988803863525, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7086742520332336, + "num_tokens": 234909901.0, + "step": 9278 + }, + { + "epoch": 1.0189984625521633, + "grad_norm": 2.109504461288452, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7124615907669067, + "num_tokens": 234937582.0, + "step": 9279 + }, + { + "epoch": 1.019108280254777, + "grad_norm": 2.330061435699463, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7190805077552795, + "num_tokens": 234962468.0, + "step": 9280 + }, + { + "epoch": 1.0192180979573908, + "grad_norm": 2.5299489498138428, + "learning_rate": 1e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.7338395714759827, + "num_tokens": 234982090.0, + "step": 9281 + }, + { + "epoch": 1.0193279156600044, + "grad_norm": 2.471170425415039, + "learning_rate": 1e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7243461608886719, + "num_tokens": 235003543.0, + "step": 9282 + }, + { + "epoch": 1.0194377333626181, + "grad_norm": 2.5485103130340576, + "learning_rate": 1e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7224355340003967, + "num_tokens": 235024247.0, + "step": 9283 + }, + { + "epoch": 1.0195475510652316, + "grad_norm": 2.1754448413848877, + "learning_rate": 1e-06, + "loss": 0.8671, + "mean_token_accuracy": 0.7302194833755493, + "num_tokens": 235049996.0, + "step": 9284 + }, + { + "epoch": 1.0196573687678454, + "grad_norm": 2.309676170349121, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.710578441619873, + "num_tokens": 235074729.0, + "step": 9285 + }, + { + "epoch": 1.019767186470459, + "grad_norm": 2.022221326828003, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7170092463493347, + "num_tokens": 235104054.0, + "step": 9286 + }, + { + "epoch": 1.0198770041730727, + "grad_norm": 2.255830764770508, 
+ "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7198694348335266, + "num_tokens": 235128650.0, + "step": 9287 + }, + { + "epoch": 1.0199868218756865, + "grad_norm": 1.8564597368240356, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7013382315635681, + "num_tokens": 235165632.0, + "step": 9288 + }, + { + "epoch": 1.0200966395783, + "grad_norm": 2.3678133487701416, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7220070362091064, + "num_tokens": 235188331.0, + "step": 9289 + }, + { + "epoch": 1.0202064572809137, + "grad_norm": 2.200324058532715, + "learning_rate": 1e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.7341580390930176, + "num_tokens": 235213718.0, + "step": 9290 + }, + { + "epoch": 1.0203162749835273, + "grad_norm": 2.485382080078125, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7081643342971802, + "num_tokens": 235234373.0, + "step": 9291 + }, + { + "epoch": 1.020426092686141, + "grad_norm": 2.3002026081085205, + "learning_rate": 1e-06, + "loss": 0.7682, + "mean_token_accuracy": 0.7519873976707458, + "num_tokens": 235256604.0, + "step": 9292 + }, + { + "epoch": 1.0205359103887546, + "grad_norm": 1.9767509698867798, + "learning_rate": 1e-06, + "loss": 0.855, + "mean_token_accuracy": 0.7451504468917847, + "num_tokens": 235287523.0, + "step": 9293 + }, + { + "epoch": 1.0206457280913683, + "grad_norm": 2.3088793754577637, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7067042589187622, + "num_tokens": 235314002.0, + "step": 9294 + }, + { + "epoch": 1.020755545793982, + "grad_norm": 2.545184373855591, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7183059453964233, + "num_tokens": 235334849.0, + "step": 9295 + }, + { + "epoch": 1.0208653634965956, + "grad_norm": 2.463515281677246, + "learning_rate": 1e-06, + "loss": 0.7844, + "mean_token_accuracy": 0.7475156784057617, + "num_tokens": 235354706.0, + "step": 9296 + }, 
+ { + "epoch": 1.0209751811992094, + "grad_norm": 2.2984185218811035, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7113341093063354, + "num_tokens": 235380230.0, + "step": 9297 + }, + { + "epoch": 1.021084998901823, + "grad_norm": 2.158620595932007, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.6992612481117249, + "num_tokens": 235409143.0, + "step": 9298 + }, + { + "epoch": 1.0211948166044367, + "grad_norm": 2.115086317062378, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7222625613212585, + "num_tokens": 235436141.0, + "step": 9299 + }, + { + "epoch": 1.0213046343070502, + "grad_norm": 2.4508354663848877, + "learning_rate": 1e-06, + "loss": 0.8587, + "mean_token_accuracy": 0.7327144145965576, + "num_tokens": 235458579.0, + "step": 9300 + }, + { + "epoch": 1.021414452009664, + "grad_norm": 2.4814977645874023, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.71584153175354, + "num_tokens": 235480640.0, + "step": 9301 + }, + { + "epoch": 1.0215242697122777, + "grad_norm": 2.101867914199829, + "learning_rate": 1e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7250077128410339, + "num_tokens": 235507007.0, + "step": 9302 + }, + { + "epoch": 1.0216340874148913, + "grad_norm": 2.2593302726745605, + "learning_rate": 1e-06, + "loss": 0.8644, + "mean_token_accuracy": 0.7321614623069763, + "num_tokens": 235530254.0, + "step": 9303 + }, + { + "epoch": 1.021743905117505, + "grad_norm": 2.491880416870117, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7149319648742676, + "num_tokens": 235551921.0, + "step": 9304 + }, + { + "epoch": 1.0218537228201185, + "grad_norm": 1.9966342449188232, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7113303542137146, + "num_tokens": 235585146.0, + "step": 9305 + }, + { + "epoch": 1.0219635405227323, + "grad_norm": 2.2617809772491455, + "learning_rate": 1e-06, + "loss": 0.8746, + 
"mean_token_accuracy": 0.728195309638977, + "num_tokens": 235612125.0, + "step": 9306 + }, + { + "epoch": 1.0220733582253458, + "grad_norm": 2.076927423477173, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7073471546173096, + "num_tokens": 235641138.0, + "step": 9307 + }, + { + "epoch": 1.0221831759279596, + "grad_norm": 2.164361000061035, + "learning_rate": 1e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7194221019744873, + "num_tokens": 235668534.0, + "step": 9308 + }, + { + "epoch": 1.0222929936305734, + "grad_norm": 2.0075857639312744, + "learning_rate": 1e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7250522375106812, + "num_tokens": 235696795.0, + "step": 9309 + }, + { + "epoch": 1.022402811333187, + "grad_norm": 2.5095865726470947, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7063313722610474, + "num_tokens": 235720176.0, + "step": 9310 + }, + { + "epoch": 1.0225126290358006, + "grad_norm": 2.360172748565674, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7214593291282654, + "num_tokens": 235746549.0, + "step": 9311 + }, + { + "epoch": 1.0226224467384142, + "grad_norm": 2.0775554180145264, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7107088565826416, + "num_tokens": 235776382.0, + "step": 9312 + }, + { + "epoch": 1.022732264441028, + "grad_norm": 2.1063284873962402, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.717490017414093, + "num_tokens": 235802161.0, + "step": 9313 + }, + { + "epoch": 1.0228420821436415, + "grad_norm": 2.1647427082061768, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7294974327087402, + "num_tokens": 235828001.0, + "step": 9314 + }, + { + "epoch": 1.0229518998462552, + "grad_norm": 2.3664145469665527, + "learning_rate": 1e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.7253820896148682, + "num_tokens": 235851910.0, + "step": 9315 + }, + { + "epoch": 1.023061717548869, + 
"grad_norm": 2.2565767765045166, + "learning_rate": 1e-06, + "loss": 0.8017, + "mean_token_accuracy": 0.7459521293640137, + "num_tokens": 235875378.0, + "step": 9316 + }, + { + "epoch": 1.0231715352514825, + "grad_norm": 2.245177984237671, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7154890894889832, + "num_tokens": 235901907.0, + "step": 9317 + }, + { + "epoch": 1.0232813529540963, + "grad_norm": 2.1977949142456055, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7199264168739319, + "num_tokens": 235927749.0, + "step": 9318 + }, + { + "epoch": 1.0233911706567098, + "grad_norm": 2.269108533859253, + "learning_rate": 1e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.7185708284378052, + "num_tokens": 235953767.0, + "step": 9319 + }, + { + "epoch": 1.0235009883593236, + "grad_norm": 2.1435670852661133, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7094823122024536, + "num_tokens": 235985987.0, + "step": 9320 + }, + { + "epoch": 1.023610806061937, + "grad_norm": 2.0200722217559814, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7120497226715088, + "num_tokens": 236016014.0, + "step": 9321 + }, + { + "epoch": 1.0237206237645509, + "grad_norm": 2.600651264190674, + "learning_rate": 1e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.7324708104133606, + "num_tokens": 236035594.0, + "step": 9322 + }, + { + "epoch": 1.0238304414671644, + "grad_norm": 2.2632040977478027, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7229864001274109, + "num_tokens": 236060843.0, + "step": 9323 + }, + { + "epoch": 1.0239402591697782, + "grad_norm": 2.0678975582122803, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7009000778198242, + "num_tokens": 236092768.0, + "step": 9324 + }, + { + "epoch": 1.024050076872392, + "grad_norm": 2.0446181297302246, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7211004495620728, + 
"num_tokens": 236121311.0, + "step": 9325 + }, + { + "epoch": 1.0241598945750054, + "grad_norm": 2.24385929107666, + "learning_rate": 1e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7257636189460754, + "num_tokens": 236145934.0, + "step": 9326 + }, + { + "epoch": 1.0242697122776192, + "grad_norm": 2.239711284637451, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7027029991149902, + "num_tokens": 236170721.0, + "step": 9327 + }, + { + "epoch": 1.0243795299802327, + "grad_norm": 2.3924641609191895, + "learning_rate": 1e-06, + "loss": 0.803, + "mean_token_accuracy": 0.7473868131637573, + "num_tokens": 236191068.0, + "step": 9328 + }, + { + "epoch": 1.0244893476828465, + "grad_norm": 2.1805591583251953, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7154974341392517, + "num_tokens": 236215820.0, + "step": 9329 + }, + { + "epoch": 1.02459916538546, + "grad_norm": 2.381432294845581, + "learning_rate": 1e-06, + "loss": 0.84, + "mean_token_accuracy": 0.7288541793823242, + "num_tokens": 236237143.0, + "step": 9330 + }, + { + "epoch": 1.0247089830880738, + "grad_norm": 2.466153144836426, + "learning_rate": 1e-06, + "loss": 0.7596, + "mean_token_accuracy": 0.7518240809440613, + "num_tokens": 236257502.0, + "step": 9331 + }, + { + "epoch": 1.0248188007906875, + "grad_norm": 1.9825984239578247, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7099140286445618, + "num_tokens": 236287677.0, + "step": 9332 + }, + { + "epoch": 1.024928618493301, + "grad_norm": 2.2194583415985107, + "learning_rate": 1e-06, + "loss": 0.8188, + "mean_token_accuracy": 0.7465646266937256, + "num_tokens": 236310862.0, + "step": 9333 + }, + { + "epoch": 1.0250384361959148, + "grad_norm": 2.3474910259246826, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7082499265670776, + "num_tokens": 236333992.0, + "step": 9334 + }, + { + "epoch": 1.0251482538985284, + "grad_norm": 1.9889942407608032, + "learning_rate": 
1e-06, + "loss": 0.8188, + "mean_token_accuracy": 0.7373083233833313, + "num_tokens": 236363737.0, + "step": 9335 + }, + { + "epoch": 1.0252580716011421, + "grad_norm": 2.4522151947021484, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7215383052825928, + "num_tokens": 236384166.0, + "step": 9336 + }, + { + "epoch": 1.0253678893037557, + "grad_norm": 2.0292611122131348, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.712439775466919, + "num_tokens": 236415556.0, + "step": 9337 + }, + { + "epoch": 1.0254777070063694, + "grad_norm": 2.35567569732666, + "learning_rate": 1e-06, + "loss": 0.827, + "mean_token_accuracy": 0.7355297803878784, + "num_tokens": 236438643.0, + "step": 9338 + }, + { + "epoch": 1.0255875247089832, + "grad_norm": 2.452420949935913, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7387558221817017, + "num_tokens": 236459985.0, + "step": 9339 + }, + { + "epoch": 1.0256973424115967, + "grad_norm": 1.972198247909546, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.6985081434249878, + "num_tokens": 236493245.0, + "step": 9340 + }, + { + "epoch": 1.0258071601142105, + "grad_norm": 2.2436888217926025, + "learning_rate": 1e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7245365381240845, + "num_tokens": 236519252.0, + "step": 9341 + }, + { + "epoch": 1.025916977816824, + "grad_norm": 2.1332075595855713, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.709652841091156, + "num_tokens": 236548881.0, + "step": 9342 + }, + { + "epoch": 1.0260267955194378, + "grad_norm": 2.198087215423584, + "learning_rate": 1e-06, + "loss": 0.8317, + "mean_token_accuracy": 0.7380135655403137, + "num_tokens": 236573183.0, + "step": 9343 + }, + { + "epoch": 1.0261366132220513, + "grad_norm": 2.113344669342041, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7250301837921143, + "num_tokens": 236600219.0, + "step": 9344 + }, + { + "epoch": 
1.026246430924665, + "grad_norm": 1.9828087091445923, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7017121315002441, + "num_tokens": 236632861.0, + "step": 9345 + }, + { + "epoch": 1.0263562486272788, + "grad_norm": 2.3626039028167725, + "learning_rate": 1e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7257559299468994, + "num_tokens": 236657615.0, + "step": 9346 + }, + { + "epoch": 1.0264660663298923, + "grad_norm": 1.981186866760254, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7216243147850037, + "num_tokens": 236686022.0, + "step": 9347 + }, + { + "epoch": 1.026575884032506, + "grad_norm": 2.4949817657470703, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7293400764465332, + "num_tokens": 236706759.0, + "step": 9348 + }, + { + "epoch": 1.0266857017351196, + "grad_norm": 2.113986015319824, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7015666365623474, + "num_tokens": 236735211.0, + "step": 9349 + }, + { + "epoch": 1.0267955194377334, + "grad_norm": 2.3310470581054688, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.704452633857727, + "num_tokens": 236760512.0, + "step": 9350 + }, + { + "epoch": 1.026905337140347, + "grad_norm": 2.2751142978668213, + "learning_rate": 1e-06, + "loss": 0.8049, + "mean_token_accuracy": 0.7406145334243774, + "num_tokens": 236784144.0, + "step": 9351 + }, + { + "epoch": 1.0270151548429607, + "grad_norm": 2.2312240600585938, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7115335464477539, + "num_tokens": 236811857.0, + "step": 9352 + }, + { + "epoch": 1.0271249725455744, + "grad_norm": 2.2317638397216797, + "learning_rate": 1e-06, + "loss": 0.8303, + "mean_token_accuracy": 0.7405814528465271, + "num_tokens": 236835583.0, + "step": 9353 + }, + { + "epoch": 1.027234790248188, + "grad_norm": 2.029482841491699, + "learning_rate": 1e-06, + "loss": 0.8591, + "mean_token_accuracy": 
0.7241007685661316, + "num_tokens": 236863199.0, + "step": 9354 + }, + { + "epoch": 1.0273446079508017, + "grad_norm": 2.250714063644409, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7214504480361938, + "num_tokens": 236889158.0, + "step": 9355 + }, + { + "epoch": 1.0274544256534153, + "grad_norm": 2.449838399887085, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7106237411499023, + "num_tokens": 236911337.0, + "step": 9356 + }, + { + "epoch": 1.027564243356029, + "grad_norm": 2.0260865688323975, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.707330584526062, + "num_tokens": 236940629.0, + "step": 9357 + }, + { + "epoch": 1.0276740610586426, + "grad_norm": 2.5830740928649902, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7076499462127686, + "num_tokens": 236962058.0, + "step": 9358 + }, + { + "epoch": 1.0277838787612563, + "grad_norm": 2.2041985988616943, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7072169780731201, + "num_tokens": 236987145.0, + "step": 9359 + }, + { + "epoch": 1.02789369646387, + "grad_norm": 2.18100643157959, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7158439755439758, + "num_tokens": 237014826.0, + "step": 9360 + }, + { + "epoch": 1.0280035141664836, + "grad_norm": 2.0345027446746826, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.7019404172897339, + "num_tokens": 237049120.0, + "step": 9361 + }, + { + "epoch": 1.0281133318690974, + "grad_norm": 2.117262840270996, + "learning_rate": 1e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.7275288701057434, + "num_tokens": 237076771.0, + "step": 9362 + }, + { + "epoch": 1.028223149571711, + "grad_norm": 2.142350435256958, + "learning_rate": 1e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.6887302994728088, + "num_tokens": 237105069.0, + "step": 9363 + }, + { + "epoch": 1.0283329672743247, + "grad_norm": 2.317516565322876, 
+ "learning_rate": 1e-06, + "loss": 0.8109, + "mean_token_accuracy": 0.7390583753585815, + "num_tokens": 237127800.0, + "step": 9364 + }, + { + "epoch": 1.0284427849769382, + "grad_norm": 2.1114416122436523, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7067337036132812, + "num_tokens": 237158793.0, + "step": 9365 + }, + { + "epoch": 1.028552602679552, + "grad_norm": 2.6434149742126465, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7051378488540649, + "num_tokens": 237179946.0, + "step": 9366 + }, + { + "epoch": 1.0286624203821657, + "grad_norm": 2.5258264541625977, + "learning_rate": 1e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7363152503967285, + "num_tokens": 237200253.0, + "step": 9367 + }, + { + "epoch": 1.0287722380847792, + "grad_norm": 2.395446300506592, + "learning_rate": 1e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7379392385482788, + "num_tokens": 237225160.0, + "step": 9368 + }, + { + "epoch": 1.028882055787393, + "grad_norm": 2.282602310180664, + "learning_rate": 1e-06, + "loss": 0.8319, + "mean_token_accuracy": 0.743728756904602, + "num_tokens": 237247843.0, + "step": 9369 + }, + { + "epoch": 1.0289918734900065, + "grad_norm": 2.3235292434692383, + "learning_rate": 1e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7294131517410278, + "num_tokens": 237274757.0, + "step": 9370 + }, + { + "epoch": 1.0291016911926203, + "grad_norm": 2.175380229949951, + "learning_rate": 1e-06, + "loss": 1.0282, + "mean_token_accuracy": 0.6861339807510376, + "num_tokens": 237305642.0, + "step": 9371 + }, + { + "epoch": 1.0292115088952338, + "grad_norm": 2.1598803997039795, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7196086049079895, + "num_tokens": 237335115.0, + "step": 9372 + }, + { + "epoch": 1.0293213265978476, + "grad_norm": 2.600308895111084, + "learning_rate": 1e-06, + "loss": 0.8674, + "mean_token_accuracy": 0.7323930859565735, + "num_tokens": 237356392.0, + "step": 9373 + 
}, + { + "epoch": 1.0294311443004613, + "grad_norm": 2.240461826324463, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7200058698654175, + "num_tokens": 237382400.0, + "step": 9374 + }, + { + "epoch": 1.0295409620030749, + "grad_norm": 2.1048336029052734, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.7061017751693726, + "num_tokens": 237411999.0, + "step": 9375 + }, + { + "epoch": 1.0296507797056886, + "grad_norm": 2.0573480129241943, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7067524790763855, + "num_tokens": 237440290.0, + "step": 9376 + }, + { + "epoch": 1.0297605974083022, + "grad_norm": 2.4094738960266113, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7143625617027283, + "num_tokens": 237463774.0, + "step": 9377 + }, + { + "epoch": 1.029870415110916, + "grad_norm": 2.3184425830841064, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7103070020675659, + "num_tokens": 237490061.0, + "step": 9378 + }, + { + "epoch": 1.0299802328135295, + "grad_norm": 2.5249011516571045, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7312458753585815, + "num_tokens": 237510617.0, + "step": 9379 + }, + { + "epoch": 1.0300900505161432, + "grad_norm": 2.0605950355529785, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.6988177299499512, + "num_tokens": 237542240.0, + "step": 9380 + }, + { + "epoch": 1.030199868218757, + "grad_norm": 2.0186119079589844, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7035683393478394, + "num_tokens": 237569623.0, + "step": 9381 + }, + { + "epoch": 1.0303096859213705, + "grad_norm": 2.350855827331543, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7161715030670166, + "num_tokens": 237592804.0, + "step": 9382 + }, + { + "epoch": 1.0304195036239843, + "grad_norm": 2.093611001968384, + "learning_rate": 1e-06, + "loss": 0.9842, + 
"mean_token_accuracy": 0.6925654411315918, + "num_tokens": 237622742.0, + "step": 9383 + }, + { + "epoch": 1.0305293213265978, + "grad_norm": 2.1891262531280518, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7155859470367432, + "num_tokens": 237649853.0, + "step": 9384 + }, + { + "epoch": 1.0306391390292116, + "grad_norm": 2.23268985748291, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7209644913673401, + "num_tokens": 237675679.0, + "step": 9385 + }, + { + "epoch": 1.030748956731825, + "grad_norm": 2.3390252590179443, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7052207589149475, + "num_tokens": 237699823.0, + "step": 9386 + }, + { + "epoch": 1.0308587744344389, + "grad_norm": 2.2794532775878906, + "learning_rate": 1e-06, + "loss": 1.0556, + "mean_token_accuracy": 0.6852540969848633, + "num_tokens": 237726186.0, + "step": 9387 + }, + { + "epoch": 1.0309685921370524, + "grad_norm": 2.326227903366089, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7113443613052368, + "num_tokens": 237749839.0, + "step": 9388 + }, + { + "epoch": 1.0310784098396661, + "grad_norm": 2.372206211090088, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7106561660766602, + "num_tokens": 237775052.0, + "step": 9389 + }, + { + "epoch": 1.03118822754228, + "grad_norm": 2.1728451251983643, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7097921967506409, + "num_tokens": 237804129.0, + "step": 9390 + }, + { + "epoch": 1.0312980452448934, + "grad_norm": 2.2039003372192383, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7080021500587463, + "num_tokens": 237830397.0, + "step": 9391 + }, + { + "epoch": 1.0314078629475072, + "grad_norm": 1.9190329313278198, + "learning_rate": 1e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.7200477123260498, + "num_tokens": 237864060.0, + "step": 9392 + }, + { + "epoch": 1.0315176806501207, + 
"grad_norm": 2.292020082473755, + "learning_rate": 1e-06, + "loss": 0.8467, + "mean_token_accuracy": 0.7317624688148499, + "num_tokens": 237885586.0, + "step": 9393 + }, + { + "epoch": 1.0316274983527345, + "grad_norm": 2.4277596473693848, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7118805646896362, + "num_tokens": 237907809.0, + "step": 9394 + }, + { + "epoch": 1.031737316055348, + "grad_norm": 2.4955854415893555, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7246429920196533, + "num_tokens": 237930007.0, + "step": 9395 + }, + { + "epoch": 1.0318471337579618, + "grad_norm": 2.66921329498291, + "learning_rate": 1e-06, + "loss": 0.8117, + "mean_token_accuracy": 0.7444350123405457, + "num_tokens": 237949399.0, + "step": 9396 + }, + { + "epoch": 1.0319569514605755, + "grad_norm": 2.1940627098083496, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.706262469291687, + "num_tokens": 237977527.0, + "step": 9397 + }, + { + "epoch": 1.032066769163189, + "grad_norm": 2.4204485416412354, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7192893624305725, + "num_tokens": 238000278.0, + "step": 9398 + }, + { + "epoch": 1.0321765868658028, + "grad_norm": 2.435959577560425, + "learning_rate": 1e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.6940338015556335, + "num_tokens": 238025235.0, + "step": 9399 + }, + { + "epoch": 1.0322864045684164, + "grad_norm": 2.343708038330078, + "learning_rate": 1e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.7283729314804077, + "num_tokens": 238049883.0, + "step": 9400 + }, + { + "epoch": 1.0323962222710301, + "grad_norm": 2.5572192668914795, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7183274626731873, + "num_tokens": 238073379.0, + "step": 9401 + }, + { + "epoch": 1.0325060399736437, + "grad_norm": 2.2516160011291504, + "learning_rate": 1e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.721584677696228, + 
"num_tokens": 238097637.0, + "step": 9402 + }, + { + "epoch": 1.0326158576762574, + "grad_norm": 2.311121940612793, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7257112264633179, + "num_tokens": 238122679.0, + "step": 9403 + }, + { + "epoch": 1.0327256753788712, + "grad_norm": 2.019070863723755, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7078242301940918, + "num_tokens": 238152270.0, + "step": 9404 + }, + { + "epoch": 1.0328354930814847, + "grad_norm": 2.2329812049865723, + "learning_rate": 1e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7322147488594055, + "num_tokens": 238177826.0, + "step": 9405 + }, + { + "epoch": 1.0329453107840985, + "grad_norm": 2.328009843826294, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7120704650878906, + "num_tokens": 238201608.0, + "step": 9406 + }, + { + "epoch": 1.033055128486712, + "grad_norm": 2.2002017498016357, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7099061012268066, + "num_tokens": 238226660.0, + "step": 9407 + }, + { + "epoch": 1.0331649461893258, + "grad_norm": 1.9529871940612793, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7177180051803589, + "num_tokens": 238261786.0, + "step": 9408 + }, + { + "epoch": 1.0332747638919393, + "grad_norm": 2.340304374694824, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7126598358154297, + "num_tokens": 238286494.0, + "step": 9409 + }, + { + "epoch": 1.033384581594553, + "grad_norm": 2.2202603816986084, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7067404985427856, + "num_tokens": 238312876.0, + "step": 9410 + }, + { + "epoch": 1.0334943992971668, + "grad_norm": 2.315619468688965, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7222768068313599, + "num_tokens": 238336736.0, + "step": 9411 + }, + { + "epoch": 1.0336042169997803, + "grad_norm": 2.0301272869110107, + "learning_rate": 
1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7129089832305908, + "num_tokens": 238364898.0, + "step": 9412 + }, + { + "epoch": 1.033714034702394, + "grad_norm": 2.3039915561676025, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7026544809341431, + "num_tokens": 238390005.0, + "step": 9413 + }, + { + "epoch": 1.0338238524050076, + "grad_norm": 2.309244394302368, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7157567739486694, + "num_tokens": 238413961.0, + "step": 9414 + }, + { + "epoch": 1.0339336701076214, + "grad_norm": 2.5610179901123047, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7241538763046265, + "num_tokens": 238432861.0, + "step": 9415 + }, + { + "epoch": 1.034043487810235, + "grad_norm": 2.3562309741973877, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7053656578063965, + "num_tokens": 238456502.0, + "step": 9416 + }, + { + "epoch": 1.0341533055128487, + "grad_norm": 2.5263402462005615, + "learning_rate": 1e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.7397351861000061, + "num_tokens": 238475527.0, + "step": 9417 + }, + { + "epoch": 1.0342631232154624, + "grad_norm": 2.338191032409668, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7185847759246826, + "num_tokens": 238498292.0, + "step": 9418 + }, + { + "epoch": 1.034372940918076, + "grad_norm": 2.485602378845215, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.723540186882019, + "num_tokens": 238521825.0, + "step": 9419 + }, + { + "epoch": 1.0344827586206897, + "grad_norm": 2.055518388748169, + "learning_rate": 1e-06, + "loss": 0.8432, + "mean_token_accuracy": 0.7297728657722473, + "num_tokens": 238549443.0, + "step": 9420 + }, + { + "epoch": 1.0345925763233033, + "grad_norm": 2.000723361968994, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.6996645927429199, + "num_tokens": 238579043.0, + "step": 9421 + }, + { + "epoch": 
1.034702394025917, + "grad_norm": 2.4791789054870605, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7214809060096741, + "num_tokens": 238600503.0, + "step": 9422 + }, + { + "epoch": 1.0348122117285306, + "grad_norm": 2.4935812950134277, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7122654914855957, + "num_tokens": 238622816.0, + "step": 9423 + }, + { + "epoch": 1.0349220294311443, + "grad_norm": 2.5581307411193848, + "learning_rate": 1e-06, + "loss": 0.8187, + "mean_token_accuracy": 0.7399531602859497, + "num_tokens": 238642555.0, + "step": 9424 + }, + { + "epoch": 1.035031847133758, + "grad_norm": 2.114114999771118, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7033204436302185, + "num_tokens": 238671792.0, + "step": 9425 + }, + { + "epoch": 1.0351416648363716, + "grad_norm": 2.096526622772217, + "learning_rate": 1e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.7262226939201355, + "num_tokens": 238697803.0, + "step": 9426 + }, + { + "epoch": 1.0352514825389854, + "grad_norm": 2.035837173461914, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.6973183155059814, + "num_tokens": 238726521.0, + "step": 9427 + }, + { + "epoch": 1.035361300241599, + "grad_norm": 2.483720302581787, + "learning_rate": 1e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7184464931488037, + "num_tokens": 238748615.0, + "step": 9428 + }, + { + "epoch": 1.0354711179442126, + "grad_norm": 1.9550992250442505, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.7018284797668457, + "num_tokens": 238779023.0, + "step": 9429 + }, + { + "epoch": 1.0355809356468262, + "grad_norm": 2.296217441558838, + "learning_rate": 1e-06, + "loss": 0.8786, + "mean_token_accuracy": 0.7251332402229309, + "num_tokens": 238803012.0, + "step": 9430 + }, + { + "epoch": 1.03569075334944, + "grad_norm": 2.435241937637329, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 
0.7111980319023132, + "num_tokens": 238825832.0, + "step": 9431 + }, + { + "epoch": 1.0358005710520537, + "grad_norm": 2.070608615875244, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7032116055488586, + "num_tokens": 238854167.0, + "step": 9432 + }, + { + "epoch": 1.0359103887546672, + "grad_norm": 2.134120464324951, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.6953068971633911, + "num_tokens": 238882176.0, + "step": 9433 + }, + { + "epoch": 1.036020206457281, + "grad_norm": 2.1633481979370117, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7101944088935852, + "num_tokens": 238909976.0, + "step": 9434 + }, + { + "epoch": 1.0361300241598945, + "grad_norm": 2.0554912090301514, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7083408832550049, + "num_tokens": 238943575.0, + "step": 9435 + }, + { + "epoch": 1.0362398418625083, + "grad_norm": 2.413370132446289, + "learning_rate": 1e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.721916913986206, + "num_tokens": 238966598.0, + "step": 9436 + }, + { + "epoch": 1.0363496595651218, + "grad_norm": 2.207878828048706, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.7251194715499878, + "num_tokens": 238993588.0, + "step": 9437 + }, + { + "epoch": 1.0364594772677356, + "grad_norm": 2.320744752883911, + "learning_rate": 1e-06, + "loss": 0.7949, + "mean_token_accuracy": 0.7556960582733154, + "num_tokens": 239014989.0, + "step": 9438 + }, + { + "epoch": 1.036569294970349, + "grad_norm": 2.131725311279297, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7163211107254028, + "num_tokens": 239043264.0, + "step": 9439 + }, + { + "epoch": 1.0366791126729629, + "grad_norm": 2.3686623573303223, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.723129391670227, + "num_tokens": 239065165.0, + "step": 9440 + }, + { + "epoch": 1.0367889303755766, + "grad_norm": 2.140181303024292, 
+ "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.714932382106781, + "num_tokens": 239093561.0, + "step": 9441 + }, + { + "epoch": 1.0368987480781902, + "grad_norm": 2.1491103172302246, + "learning_rate": 1e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.7399674654006958, + "num_tokens": 239117751.0, + "step": 9442 + }, + { + "epoch": 1.037008565780804, + "grad_norm": 2.11574649810791, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7019375562667847, + "num_tokens": 239149167.0, + "step": 9443 + }, + { + "epoch": 1.0371183834834174, + "grad_norm": 1.922952651977539, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7030871510505676, + "num_tokens": 239184315.0, + "step": 9444 + }, + { + "epoch": 1.0372282011860312, + "grad_norm": 2.1727294921875, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7233681678771973, + "num_tokens": 239212043.0, + "step": 9445 + }, + { + "epoch": 1.037338018888645, + "grad_norm": 2.4200005531311035, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7178486585617065, + "num_tokens": 239234896.0, + "step": 9446 + }, + { + "epoch": 1.0374478365912585, + "grad_norm": 2.4847118854522705, + "learning_rate": 1e-06, + "loss": 0.8436, + "mean_token_accuracy": 0.7309417724609375, + "num_tokens": 239256116.0, + "step": 9447 + }, + { + "epoch": 1.0375576542938723, + "grad_norm": 2.5245518684387207, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7221472263336182, + "num_tokens": 239276685.0, + "step": 9448 + }, + { + "epoch": 1.0376674719964858, + "grad_norm": 2.3812129497528076, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7018445134162903, + "num_tokens": 239300939.0, + "step": 9449 + }, + { + "epoch": 1.0377772896990995, + "grad_norm": 2.1920249462127686, + "learning_rate": 1e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7325001955032349, + "num_tokens": 239325813.0, + "step": 9450 + }, 
+ { + "epoch": 1.037887107401713, + "grad_norm": 2.310807704925537, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7131931781768799, + "num_tokens": 239348977.0, + "step": 9451 + }, + { + "epoch": 1.0379969251043268, + "grad_norm": 2.328381299972534, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7170066833496094, + "num_tokens": 239372687.0, + "step": 9452 + }, + { + "epoch": 1.0381067428069404, + "grad_norm": 2.2109527587890625, + "learning_rate": 1e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7327291369438171, + "num_tokens": 239397893.0, + "step": 9453 + }, + { + "epoch": 1.0382165605095541, + "grad_norm": 2.280956983566284, + "learning_rate": 1e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7172157168388367, + "num_tokens": 239423681.0, + "step": 9454 + }, + { + "epoch": 1.0383263782121679, + "grad_norm": 2.551567316055298, + "learning_rate": 1e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7274700999259949, + "num_tokens": 239442855.0, + "step": 9455 + }, + { + "epoch": 1.0384361959147814, + "grad_norm": 2.4194741249084473, + "learning_rate": 1e-06, + "loss": 0.8059, + "mean_token_accuracy": 0.7372013330459595, + "num_tokens": 239463861.0, + "step": 9456 + }, + { + "epoch": 1.0385460136173952, + "grad_norm": 2.574063777923584, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7178969979286194, + "num_tokens": 239482646.0, + "step": 9457 + }, + { + "epoch": 1.0386558313200087, + "grad_norm": 2.3151051998138428, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7245599627494812, + "num_tokens": 239506179.0, + "step": 9458 + }, + { + "epoch": 1.0387656490226225, + "grad_norm": 1.9808768033981323, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7258150577545166, + "num_tokens": 239537808.0, + "step": 9459 + }, + { + "epoch": 1.038875466725236, + "grad_norm": 2.53977370262146, + "learning_rate": 1e-06, + "loss": 0.924, + 
"mean_token_accuracy": 0.7115505933761597, + "num_tokens": 239557875.0, + "step": 9460 + }, + { + "epoch": 1.0389852844278498, + "grad_norm": 2.44339919090271, + "learning_rate": 1e-06, + "loss": 0.8419, + "mean_token_accuracy": 0.7426052689552307, + "num_tokens": 239578977.0, + "step": 9461 + }, + { + "epoch": 1.0390951021304635, + "grad_norm": 2.227950096130371, + "learning_rate": 1e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7148157954216003, + "num_tokens": 239606003.0, + "step": 9462 + }, + { + "epoch": 1.039204919833077, + "grad_norm": 2.207158088684082, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.712134063243866, + "num_tokens": 239630981.0, + "step": 9463 + }, + { + "epoch": 1.0393147375356908, + "grad_norm": 2.699765920639038, + "learning_rate": 1e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7296535968780518, + "num_tokens": 239651645.0, + "step": 9464 + }, + { + "epoch": 1.0394245552383043, + "grad_norm": 2.321056604385376, + "learning_rate": 1e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.724880576133728, + "num_tokens": 239677413.0, + "step": 9465 + }, + { + "epoch": 1.039534372940918, + "grad_norm": 2.134317398071289, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7090764045715332, + "num_tokens": 239706505.0, + "step": 9466 + }, + { + "epoch": 1.0396441906435316, + "grad_norm": 2.317594051361084, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7259347438812256, + "num_tokens": 239731134.0, + "step": 9467 + }, + { + "epoch": 1.0397540083461454, + "grad_norm": 2.2670161724090576, + "learning_rate": 1e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.729231595993042, + "num_tokens": 239757762.0, + "step": 9468 + }, + { + "epoch": 1.0398638260487592, + "grad_norm": 2.1273908615112305, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.6933807134628296, + "num_tokens": 239785666.0, + "step": 9469 + }, + { + "epoch": 1.0399736437513727, + 
"grad_norm": 2.031923294067383, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7104409337043762, + "num_tokens": 239816436.0, + "step": 9470 + }, + { + "epoch": 1.0400834614539864, + "grad_norm": 2.719238042831421, + "learning_rate": 1e-06, + "loss": 0.8829, + "mean_token_accuracy": 0.7272621989250183, + "num_tokens": 239835730.0, + "step": 9471 + }, + { + "epoch": 1.0401932791566, + "grad_norm": 2.06872820854187, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.703731894493103, + "num_tokens": 239865425.0, + "step": 9472 + }, + { + "epoch": 1.0403030968592137, + "grad_norm": 2.2440435886383057, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7010658979415894, + "num_tokens": 239891059.0, + "step": 9473 + }, + { + "epoch": 1.0404129145618273, + "grad_norm": 2.3488528728485107, + "learning_rate": 1e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7339677214622498, + "num_tokens": 239913756.0, + "step": 9474 + }, + { + "epoch": 1.040522732264441, + "grad_norm": 2.3674588203430176, + "learning_rate": 1e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7269543409347534, + "num_tokens": 239936767.0, + "step": 9475 + }, + { + "epoch": 1.0406325499670548, + "grad_norm": 2.7104508876800537, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7107698917388916, + "num_tokens": 239955186.0, + "step": 9476 + }, + { + "epoch": 1.0407423676696683, + "grad_norm": 2.1078665256500244, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7131447792053223, + "num_tokens": 239982509.0, + "step": 9477 + }, + { + "epoch": 1.040852185372282, + "grad_norm": 1.9092047214508057, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7174899578094482, + "num_tokens": 240015635.0, + "step": 9478 + }, + { + "epoch": 1.0409620030748956, + "grad_norm": 2.3449807167053223, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7111495137214661, + "num_tokens": 
240039212.0, + "step": 9479 + }, + { + "epoch": 1.0410718207775094, + "grad_norm": 2.2305994033813477, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7045339345932007, + "num_tokens": 240065440.0, + "step": 9480 + }, + { + "epoch": 1.041181638480123, + "grad_norm": 2.00785756111145, + "learning_rate": 1e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7286062240600586, + "num_tokens": 240095919.0, + "step": 9481 + }, + { + "epoch": 1.0412914561827367, + "grad_norm": 2.0727362632751465, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7211391925811768, + "num_tokens": 240125037.0, + "step": 9482 + }, + { + "epoch": 1.0414012738853504, + "grad_norm": 2.223146915435791, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.713245689868927, + "num_tokens": 240150530.0, + "step": 9483 + }, + { + "epoch": 1.041511091587964, + "grad_norm": 2.3166286945343018, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7008261680603027, + "num_tokens": 240175155.0, + "step": 9484 + }, + { + "epoch": 1.0416209092905777, + "grad_norm": 1.9871256351470947, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7160961627960205, + "num_tokens": 240206687.0, + "step": 9485 + }, + { + "epoch": 1.0417307269931912, + "grad_norm": 2.445695638656616, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7152411937713623, + "num_tokens": 240229645.0, + "step": 9486 + }, + { + "epoch": 1.041840544695805, + "grad_norm": 2.315070152282715, + "learning_rate": 1e-06, + "loss": 0.808, + "mean_token_accuracy": 0.7429909706115723, + "num_tokens": 240251530.0, + "step": 9487 + }, + { + "epoch": 1.0419503623984185, + "grad_norm": 1.9897949695587158, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7040446996688843, + "num_tokens": 240281825.0, + "step": 9488 + }, + { + "epoch": 1.0420601801010323, + "grad_norm": 2.133122205734253, + "learning_rate": 1e-06, + "loss": 
0.8624, + "mean_token_accuracy": 0.7267922759056091, + "num_tokens": 240309196.0, + "step": 9489 + }, + { + "epoch": 1.042169997803646, + "grad_norm": 2.3277385234832764, + "learning_rate": 1e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7200425267219543, + "num_tokens": 240332380.0, + "step": 9490 + }, + { + "epoch": 1.0422798155062596, + "grad_norm": 2.393618106842041, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7217133045196533, + "num_tokens": 240356874.0, + "step": 9491 + }, + { + "epoch": 1.0423896332088733, + "grad_norm": 2.2880196571350098, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7217689752578735, + "num_tokens": 240380566.0, + "step": 9492 + }, + { + "epoch": 1.0424994509114869, + "grad_norm": 2.8052568435668945, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7114852666854858, + "num_tokens": 240401744.0, + "step": 9493 + }, + { + "epoch": 1.0426092686141006, + "grad_norm": 2.4497456550598145, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7024397850036621, + "num_tokens": 240422329.0, + "step": 9494 + }, + { + "epoch": 1.0427190863167142, + "grad_norm": 1.9144201278686523, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7204113006591797, + "num_tokens": 240454114.0, + "step": 9495 + }, + { + "epoch": 1.042828904019328, + "grad_norm": 2.1304657459259033, + "learning_rate": 1e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.7316387891769409, + "num_tokens": 240480529.0, + "step": 9496 + }, + { + "epoch": 1.0429387217219417, + "grad_norm": 2.4826581478118896, + "learning_rate": 1e-06, + "loss": 0.8304, + "mean_token_accuracy": 0.736385703086853, + "num_tokens": 240503507.0, + "step": 9497 + }, + { + "epoch": 1.0430485394245552, + "grad_norm": 2.4236440658569336, + "learning_rate": 1e-06, + "loss": 0.8453, + "mean_token_accuracy": 0.7372627258300781, + "num_tokens": 240525143.0, + "step": 9498 + }, + { + "epoch": 
1.043158357127169, + "grad_norm": 2.0970516204833984, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7129809856414795, + "num_tokens": 240554800.0, + "step": 9499 + }, + { + "epoch": 1.0432681748297825, + "grad_norm": 2.396523952484131, + "learning_rate": 1e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7274199724197388, + "num_tokens": 240577722.0, + "step": 9500 + }, + { + "epoch": 1.0433779925323963, + "grad_norm": 2.2447469234466553, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7046583294868469, + "num_tokens": 240601931.0, + "step": 9501 + }, + { + "epoch": 1.0434878102350098, + "grad_norm": 2.323854446411133, + "learning_rate": 1e-06, + "loss": 0.8499, + "mean_token_accuracy": 0.7320756316184998, + "num_tokens": 240624072.0, + "step": 9502 + }, + { + "epoch": 1.0435976279376236, + "grad_norm": 2.456666946411133, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7116968631744385, + "num_tokens": 240646020.0, + "step": 9503 + }, + { + "epoch": 1.043707445640237, + "grad_norm": 2.2604832649230957, + "learning_rate": 1e-06, + "loss": 0.8329, + "mean_token_accuracy": 0.7322307825088501, + "num_tokens": 240670163.0, + "step": 9504 + }, + { + "epoch": 1.0438172633428509, + "grad_norm": 2.4287497997283936, + "learning_rate": 1e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7232872247695923, + "num_tokens": 240692429.0, + "step": 9505 + }, + { + "epoch": 1.0439270810454646, + "grad_norm": 2.511704206466675, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7212386131286621, + "num_tokens": 240715222.0, + "step": 9506 + }, + { + "epoch": 1.0440368987480781, + "grad_norm": 2.238680124282837, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.7209164500236511, + "num_tokens": 240740193.0, + "step": 9507 + }, + { + "epoch": 1.044146716450692, + "grad_norm": 2.4269192218780518, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 
0.7215962409973145, + "num_tokens": 240763419.0, + "step": 9508 + }, + { + "epoch": 1.0442565341533054, + "grad_norm": 2.009636640548706, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7171288728713989, + "num_tokens": 240793539.0, + "step": 9509 + }, + { + "epoch": 1.0443663518559192, + "grad_norm": 2.3633861541748047, + "learning_rate": 1e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7371079325675964, + "num_tokens": 240816327.0, + "step": 9510 + }, + { + "epoch": 1.0444761695585327, + "grad_norm": 2.2483813762664795, + "learning_rate": 1e-06, + "loss": 0.8492, + "mean_token_accuracy": 0.7330970764160156, + "num_tokens": 240839974.0, + "step": 9511 + }, + { + "epoch": 1.0445859872611465, + "grad_norm": 2.6609041690826416, + "learning_rate": 1e-06, + "loss": 0.8196, + "mean_token_accuracy": 0.7394130825996399, + "num_tokens": 240858409.0, + "step": 9512 + }, + { + "epoch": 1.0446958049637602, + "grad_norm": 2.2148091793060303, + "learning_rate": 1e-06, + "loss": 0.8595, + "mean_token_accuracy": 0.7272604703903198, + "num_tokens": 240883592.0, + "step": 9513 + }, + { + "epoch": 1.0448056226663738, + "grad_norm": 2.2475087642669678, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7278735637664795, + "num_tokens": 240907102.0, + "step": 9514 + }, + { + "epoch": 1.0449154403689875, + "grad_norm": 2.1583940982818604, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.6952012777328491, + "num_tokens": 240934372.0, + "step": 9515 + }, + { + "epoch": 1.045025258071601, + "grad_norm": 2.182269334793091, + "learning_rate": 1e-06, + "loss": 0.833, + "mean_token_accuracy": 0.7366195917129517, + "num_tokens": 240959231.0, + "step": 9516 + }, + { + "epoch": 1.0451350757742148, + "grad_norm": 2.421311140060425, + "learning_rate": 1e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.724215030670166, + "num_tokens": 240982480.0, + "step": 9517 + }, + { + "epoch": 1.0452448934768284, + "grad_norm": 
2.3301186561584473, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.6885405778884888, + "num_tokens": 241009750.0, + "step": 9518 + }, + { + "epoch": 1.0453547111794421, + "grad_norm": 2.3851683139801025, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7034614682197571, + "num_tokens": 241033874.0, + "step": 9519 + }, + { + "epoch": 1.0454645288820559, + "grad_norm": 2.083576202392578, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7034521102905273, + "num_tokens": 241063617.0, + "step": 9520 + }, + { + "epoch": 1.0455743465846694, + "grad_norm": 2.1860287189483643, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7031475901603699, + "num_tokens": 241093299.0, + "step": 9521 + }, + { + "epoch": 1.0456841642872832, + "grad_norm": 2.166170597076416, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7128131985664368, + "num_tokens": 241119860.0, + "step": 9522 + }, + { + "epoch": 1.0457939819898967, + "grad_norm": 2.126356840133667, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.698460578918457, + "num_tokens": 241149065.0, + "step": 9523 + }, + { + "epoch": 1.0459037996925105, + "grad_norm": 1.9485526084899902, + "learning_rate": 1e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.7370275855064392, + "num_tokens": 241179738.0, + "step": 9524 + }, + { + "epoch": 1.046013617395124, + "grad_norm": 2.4508864879608154, + "learning_rate": 1e-06, + "loss": 0.8834, + "mean_token_accuracy": 0.7253915667533875, + "num_tokens": 241201381.0, + "step": 9525 + }, + { + "epoch": 1.0461234350977378, + "grad_norm": 2.165560245513916, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7281809449195862, + "num_tokens": 241228360.0, + "step": 9526 + }, + { + "epoch": 1.0462332528003515, + "grad_norm": 2.3948147296905518, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7187479138374329, + "num_tokens": 
241252852.0, + "step": 9527 + }, + { + "epoch": 1.046343070502965, + "grad_norm": 2.2343246936798096, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7150670289993286, + "num_tokens": 241277708.0, + "step": 9528 + }, + { + "epoch": 1.0464528882055788, + "grad_norm": 2.049889087677002, + "learning_rate": 1e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.730139434337616, + "num_tokens": 241306234.0, + "step": 9529 + }, + { + "epoch": 1.0465627059081923, + "grad_norm": 2.107823371887207, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7124648094177246, + "num_tokens": 241334972.0, + "step": 9530 + }, + { + "epoch": 1.046672523610806, + "grad_norm": 2.0581533908843994, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.7186723947525024, + "num_tokens": 241364853.0, + "step": 9531 + }, + { + "epoch": 1.0467823413134196, + "grad_norm": 2.603076934814453, + "learning_rate": 1e-06, + "loss": 0.8482, + "mean_token_accuracy": 0.7387493252754211, + "num_tokens": 241383791.0, + "step": 9532 + }, + { + "epoch": 1.0468921590160334, + "grad_norm": 2.446331739425659, + "learning_rate": 1e-06, + "loss": 0.8229, + "mean_token_accuracy": 0.7372987270355225, + "num_tokens": 241406352.0, + "step": 9533 + }, + { + "epoch": 1.0470019767186471, + "grad_norm": 2.222007989883423, + "learning_rate": 1e-06, + "loss": 0.8105, + "mean_token_accuracy": 0.7441173195838928, + "num_tokens": 241431133.0, + "step": 9534 + }, + { + "epoch": 1.0471117944212607, + "grad_norm": 2.4052958488464355, + "learning_rate": 1e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.7345982789993286, + "num_tokens": 241452637.0, + "step": 9535 + }, + { + "epoch": 1.0472216121238744, + "grad_norm": 2.243324041366577, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7092376947402954, + "num_tokens": 241477692.0, + "step": 9536 + }, + { + "epoch": 1.047331429826488, + "grad_norm": 2.1702959537506104, + "learning_rate": 1e-06, + 
"loss": 0.9224, + "mean_token_accuracy": 0.7201826572418213, + "num_tokens": 241503507.0, + "step": 9537 + }, + { + "epoch": 1.0474412475291017, + "grad_norm": 2.2318599224090576, + "learning_rate": 1e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7208378314971924, + "num_tokens": 241531252.0, + "step": 9538 + }, + { + "epoch": 1.0475510652317153, + "grad_norm": 2.194617748260498, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7107436656951904, + "num_tokens": 241558730.0, + "step": 9539 + }, + { + "epoch": 1.047660882934329, + "grad_norm": 2.2438747882843018, + "learning_rate": 1e-06, + "loss": 0.865, + "mean_token_accuracy": 0.7264732122421265, + "num_tokens": 241585434.0, + "step": 9540 + }, + { + "epoch": 1.0477707006369428, + "grad_norm": 2.426774024963379, + "learning_rate": 1e-06, + "loss": 0.8628, + "mean_token_accuracy": 0.7239168882369995, + "num_tokens": 241606173.0, + "step": 9541 + }, + { + "epoch": 1.0478805183395563, + "grad_norm": 2.7937729358673096, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7219571471214294, + "num_tokens": 241624030.0, + "step": 9542 + }, + { + "epoch": 1.04799033604217, + "grad_norm": 2.0993082523345947, + "learning_rate": 1e-06, + "loss": 1.029, + "mean_token_accuracy": 0.6831742525100708, + "num_tokens": 241656192.0, + "step": 9543 + }, + { + "epoch": 1.0481001537447836, + "grad_norm": 2.2365007400512695, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.689988911151886, + "num_tokens": 241683616.0, + "step": 9544 + }, + { + "epoch": 1.0482099714473974, + "grad_norm": 2.4069406986236572, + "learning_rate": 1e-06, + "loss": 0.787, + "mean_token_accuracy": 0.7458748817443848, + "num_tokens": 241704733.0, + "step": 9545 + }, + { + "epoch": 1.048319789150011, + "grad_norm": 2.2676825523376465, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.6977008581161499, + "num_tokens": 241730210.0, + "step": 9546 + }, + { + "epoch": 
1.0484296068526247, + "grad_norm": 2.1564440727233887, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7045630216598511, + "num_tokens": 241758888.0, + "step": 9547 + }, + { + "epoch": 1.0485394245552384, + "grad_norm": 2.2041244506835938, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7032032012939453, + "num_tokens": 241784949.0, + "step": 9548 + }, + { + "epoch": 1.048649242257852, + "grad_norm": 2.1205332279205322, + "learning_rate": 1e-06, + "loss": 0.8671, + "mean_token_accuracy": 0.7382282614707947, + "num_tokens": 241811538.0, + "step": 9549 + }, + { + "epoch": 1.0487590599604657, + "grad_norm": 2.4771862030029297, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7139225006103516, + "num_tokens": 241831753.0, + "step": 9550 + }, + { + "epoch": 1.0488688776630792, + "grad_norm": 2.349346160888672, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7141743898391724, + "num_tokens": 241854264.0, + "step": 9551 + }, + { + "epoch": 1.048978695365693, + "grad_norm": 2.3719539642333984, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7136446237564087, + "num_tokens": 241876701.0, + "step": 9552 + }, + { + "epoch": 1.0490885130683065, + "grad_norm": 2.5245161056518555, + "learning_rate": 1e-06, + "loss": 0.8291, + "mean_token_accuracy": 0.7388490438461304, + "num_tokens": 241897050.0, + "step": 9553 + }, + { + "epoch": 1.0491983307709203, + "grad_norm": 2.284311532974243, + "learning_rate": 1e-06, + "loss": 0.8232, + "mean_token_accuracy": 0.7496480941772461, + "num_tokens": 241920646.0, + "step": 9554 + }, + { + "epoch": 1.0493081484735338, + "grad_norm": 2.176647186279297, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7014178037643433, + "num_tokens": 241948409.0, + "step": 9555 + }, + { + "epoch": 1.0494179661761476, + "grad_norm": 2.342684745788574, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 
0.6984158754348755, + "num_tokens": 241971942.0, + "step": 9556 + }, + { + "epoch": 1.0495277838787613, + "grad_norm": 1.9439313411712646, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7153540849685669, + "num_tokens": 242004087.0, + "step": 9557 + }, + { + "epoch": 1.0496376015813749, + "grad_norm": 2.1407413482666016, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7009904384613037, + "num_tokens": 242031916.0, + "step": 9558 + }, + { + "epoch": 1.0497474192839886, + "grad_norm": 2.2477855682373047, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7068361639976501, + "num_tokens": 242058227.0, + "step": 9559 + }, + { + "epoch": 1.0498572369866022, + "grad_norm": 2.4734086990356445, + "learning_rate": 1e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7210850715637207, + "num_tokens": 242079364.0, + "step": 9560 + }, + { + "epoch": 1.049967054689216, + "grad_norm": 2.4463050365448, + "learning_rate": 1e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7209555506706238, + "num_tokens": 242102532.0, + "step": 9561 + }, + { + "epoch": 1.0500768723918297, + "grad_norm": 1.8662787675857544, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7111706137657166, + "num_tokens": 242136919.0, + "step": 9562 + }, + { + "epoch": 1.0501866900944432, + "grad_norm": 2.393706798553467, + "learning_rate": 1e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.7310351133346558, + "num_tokens": 242158540.0, + "step": 9563 + }, + { + "epoch": 1.050296507797057, + "grad_norm": 2.530761957168579, + "learning_rate": 1e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7265256643295288, + "num_tokens": 242179387.0, + "step": 9564 + }, + { + "epoch": 1.0504063254996705, + "grad_norm": 2.032674551010132, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7194185256958008, + "num_tokens": 242211812.0, + "step": 9565 + }, + { + "epoch": 1.0505161432022843, + "grad_norm": 
2.0063154697418213, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7020787000656128, + "num_tokens": 242244347.0, + "step": 9566 + }, + { + "epoch": 1.0506259609048978, + "grad_norm": 2.2811174392700195, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7156227231025696, + "num_tokens": 242268213.0, + "step": 9567 + }, + { + "epoch": 1.0507357786075116, + "grad_norm": 2.4785878658294678, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7325940728187561, + "num_tokens": 242290099.0, + "step": 9568 + }, + { + "epoch": 1.050845596310125, + "grad_norm": 2.398273229598999, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7104012966156006, + "num_tokens": 242313228.0, + "step": 9569 + }, + { + "epoch": 1.0509554140127388, + "grad_norm": 2.562051773071289, + "learning_rate": 1e-06, + "loss": 0.8058, + "mean_token_accuracy": 0.7460682392120361, + "num_tokens": 242332134.0, + "step": 9570 + }, + { + "epoch": 1.0510652317153526, + "grad_norm": 2.3887135982513428, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.6947535276412964, + "num_tokens": 242357596.0, + "step": 9571 + }, + { + "epoch": 1.0511750494179661, + "grad_norm": 2.3787472248077393, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7177919149398804, + "num_tokens": 242380767.0, + "step": 9572 + }, + { + "epoch": 1.05128486712058, + "grad_norm": 2.1325347423553467, + "learning_rate": 1e-06, + "loss": 0.7576, + "mean_token_accuracy": 0.7575550675392151, + "num_tokens": 242407447.0, + "step": 9573 + }, + { + "epoch": 1.0513946848231934, + "grad_norm": 2.1324350833892822, + "learning_rate": 1e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.7144286036491394, + "num_tokens": 242433534.0, + "step": 9574 + }, + { + "epoch": 1.0515045025258072, + "grad_norm": 2.4110918045043945, + "learning_rate": 1e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.7255722284317017, + "num_tokens": 
242456279.0, + "step": 9575 + }, + { + "epoch": 1.0516143202284207, + "grad_norm": 2.157015562057495, + "learning_rate": 1e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7342365384101868, + "num_tokens": 242482121.0, + "step": 9576 + }, + { + "epoch": 1.0517241379310345, + "grad_norm": 2.2313337326049805, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.713153600692749, + "num_tokens": 242507959.0, + "step": 9577 + }, + { + "epoch": 1.0518339556336482, + "grad_norm": 2.2225306034088135, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7202067375183105, + "num_tokens": 242533537.0, + "step": 9578 + }, + { + "epoch": 1.0519437733362618, + "grad_norm": 2.5489020347595215, + "learning_rate": 1e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.7370425462722778, + "num_tokens": 242552399.0, + "step": 9579 + }, + { + "epoch": 1.0520535910388755, + "grad_norm": 2.263908624649048, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7174044847488403, + "num_tokens": 242577429.0, + "step": 9580 + }, + { + "epoch": 1.052163408741489, + "grad_norm": 2.4637715816497803, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7302199602127075, + "num_tokens": 242599376.0, + "step": 9581 + }, + { + "epoch": 1.0522732264441028, + "grad_norm": 2.185387134552002, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7218490242958069, + "num_tokens": 242626076.0, + "step": 9582 + }, + { + "epoch": 1.0523830441467164, + "grad_norm": 2.1687493324279785, + "learning_rate": 1e-06, + "loss": 1.0213, + "mean_token_accuracy": 0.6925815343856812, + "num_tokens": 242653395.0, + "step": 9583 + }, + { + "epoch": 1.05249286184933, + "grad_norm": 2.0936760902404785, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.6964794397354126, + "num_tokens": 242683494.0, + "step": 9584 + }, + { + "epoch": 1.0526026795519439, + "grad_norm": 2.1582984924316406, + "learning_rate": 1e-06, + 
"loss": 0.8755, + "mean_token_accuracy": 0.7229738235473633, + "num_tokens": 242709798.0, + "step": 9585 + }, + { + "epoch": 1.0527124972545574, + "grad_norm": 2.525031805038452, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7119898796081543, + "num_tokens": 242731525.0, + "step": 9586 + }, + { + "epoch": 1.0528223149571712, + "grad_norm": 2.4020779132843018, + "learning_rate": 1e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7265101671218872, + "num_tokens": 242754728.0, + "step": 9587 + }, + { + "epoch": 1.0529321326597847, + "grad_norm": 2.0070419311523438, + "learning_rate": 1e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7200716137886047, + "num_tokens": 242784871.0, + "step": 9588 + }, + { + "epoch": 1.0530419503623984, + "grad_norm": 2.634725570678711, + "learning_rate": 1e-06, + "loss": 0.779, + "mean_token_accuracy": 0.7467689514160156, + "num_tokens": 242804888.0, + "step": 9589 + }, + { + "epoch": 1.053151768065012, + "grad_norm": 1.976065993309021, + "learning_rate": 1e-06, + "loss": 1.0118, + "mean_token_accuracy": 0.6935402154922485, + "num_tokens": 242837946.0, + "step": 9590 + }, + { + "epoch": 1.0532615857676257, + "grad_norm": 2.465695381164551, + "learning_rate": 1e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7244080305099487, + "num_tokens": 242861196.0, + "step": 9591 + }, + { + "epoch": 1.0533714034702395, + "grad_norm": 2.4188520908355713, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.7066292762756348, + "num_tokens": 242885155.0, + "step": 9592 + }, + { + "epoch": 1.053481221172853, + "grad_norm": 2.134974241256714, + "learning_rate": 1e-06, + "loss": 0.7791, + "mean_token_accuracy": 0.7535837888717651, + "num_tokens": 242910696.0, + "step": 9593 + }, + { + "epoch": 1.0535910388754668, + "grad_norm": 2.1835715770721436, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7022066712379456, + "num_tokens": 242939107.0, + "step": 9594 + }, + { + "epoch": 
1.0537008565780803, + "grad_norm": 2.0020556449890137, + "learning_rate": 1e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.6867080926895142, + "num_tokens": 242973222.0, + "step": 9595 + }, + { + "epoch": 1.053810674280694, + "grad_norm": 2.0697999000549316, + "learning_rate": 1e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.7322444915771484, + "num_tokens": 243001374.0, + "step": 9596 + }, + { + "epoch": 1.0539204919833076, + "grad_norm": 2.220463514328003, + "learning_rate": 1e-06, + "loss": 0.845, + "mean_token_accuracy": 0.7298173904418945, + "num_tokens": 243026061.0, + "step": 9597 + }, + { + "epoch": 1.0540303096859214, + "grad_norm": 2.251188278198242, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.6954504251480103, + "num_tokens": 243051815.0, + "step": 9598 + }, + { + "epoch": 1.0541401273885351, + "grad_norm": 1.9826122522354126, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7058147192001343, + "num_tokens": 243082627.0, + "step": 9599 + }, + { + "epoch": 1.0542499450911487, + "grad_norm": 1.8306385278701782, + "learning_rate": 1e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.6870061755180359, + "num_tokens": 243120190.0, + "step": 9600 + }, + { + "epoch": 1.0543597627937624, + "grad_norm": 2.5004241466522217, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7183040380477905, + "num_tokens": 243141731.0, + "step": 9601 + }, + { + "epoch": 1.054469580496376, + "grad_norm": 2.3969955444335938, + "learning_rate": 1e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7227823734283447, + "num_tokens": 243165238.0, + "step": 9602 + }, + { + "epoch": 1.0545793981989897, + "grad_norm": 2.4730403423309326, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7184720635414124, + "num_tokens": 243188164.0, + "step": 9603 + }, + { + "epoch": 1.0546892159016032, + "grad_norm": 2.283865213394165, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 
0.724777102470398, + "num_tokens": 243212418.0, + "step": 9604 + }, + { + "epoch": 1.054799033604217, + "grad_norm": 2.5840468406677246, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7120412588119507, + "num_tokens": 243234237.0, + "step": 9605 + }, + { + "epoch": 1.0549088513068308, + "grad_norm": 2.476177453994751, + "learning_rate": 1e-06, + "loss": 0.845, + "mean_token_accuracy": 0.728770911693573, + "num_tokens": 243256014.0, + "step": 9606 + }, + { + "epoch": 1.0550186690094443, + "grad_norm": 2.307063579559326, + "learning_rate": 1e-06, + "loss": 0.8646, + "mean_token_accuracy": 0.7331777811050415, + "num_tokens": 243281162.0, + "step": 9607 + }, + { + "epoch": 1.055128486712058, + "grad_norm": 2.150242567062378, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.6999077796936035, + "num_tokens": 243310658.0, + "step": 9608 + }, + { + "epoch": 1.0552383044146716, + "grad_norm": 2.0799214839935303, + "learning_rate": 1e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.739378035068512, + "num_tokens": 243339615.0, + "step": 9609 + }, + { + "epoch": 1.0553481221172853, + "grad_norm": 2.406352996826172, + "learning_rate": 1e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.7334212064743042, + "num_tokens": 243360618.0, + "step": 9610 + }, + { + "epoch": 1.0554579398198989, + "grad_norm": 2.2748537063598633, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7176831364631653, + "num_tokens": 243386601.0, + "step": 9611 + }, + { + "epoch": 1.0555677575225126, + "grad_norm": 2.3630266189575195, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7142001390457153, + "num_tokens": 243409581.0, + "step": 9612 + }, + { + "epoch": 1.0556775752251264, + "grad_norm": 2.2157113552093506, + "learning_rate": 1e-06, + "loss": 0.8218, + "mean_token_accuracy": 0.740280270576477, + "num_tokens": 243434436.0, + "step": 9613 + }, + { + "epoch": 1.05578739292774, + "grad_norm": 2.17610502243042, + 
"learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.6952239871025085, + "num_tokens": 243462228.0, + "step": 9614 + }, + { + "epoch": 1.0558972106303537, + "grad_norm": 2.6260087490081787, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7125108242034912, + "num_tokens": 243480612.0, + "step": 9615 + }, + { + "epoch": 1.0560070283329672, + "grad_norm": 2.1452176570892334, + "learning_rate": 1e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7310322523117065, + "num_tokens": 243509787.0, + "step": 9616 + }, + { + "epoch": 1.056116846035581, + "grad_norm": 2.1636595726013184, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.713860034942627, + "num_tokens": 243538590.0, + "step": 9617 + }, + { + "epoch": 1.0562266637381945, + "grad_norm": 2.118964433670044, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7064484357833862, + "num_tokens": 243568028.0, + "step": 9618 + }, + { + "epoch": 1.0563364814408083, + "grad_norm": 2.2543444633483887, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7203438878059387, + "num_tokens": 243590999.0, + "step": 9619 + }, + { + "epoch": 1.0564462991434218, + "grad_norm": 1.9813857078552246, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.6924106478691101, + "num_tokens": 243622174.0, + "step": 9620 + }, + { + "epoch": 1.0565561168460356, + "grad_norm": 2.2538671493530273, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7115957736968994, + "num_tokens": 243647374.0, + "step": 9621 + }, + { + "epoch": 1.0566659345486493, + "grad_norm": 2.1691598892211914, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7116565704345703, + "num_tokens": 243675088.0, + "step": 9622 + }, + { + "epoch": 1.0567757522512629, + "grad_norm": 2.295785665512085, + "learning_rate": 1e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.7313007116317749, + "num_tokens": 243697831.0, + "step": 9623 
+ }, + { + "epoch": 1.0568855699538766, + "grad_norm": 2.487388849258423, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7240955829620361, + "num_tokens": 243720397.0, + "step": 9624 + }, + { + "epoch": 1.0569953876564901, + "grad_norm": 2.006464719772339, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.707242488861084, + "num_tokens": 243751048.0, + "step": 9625 + }, + { + "epoch": 1.057105205359104, + "grad_norm": 2.319613456726074, + "learning_rate": 1e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7235449552536011, + "num_tokens": 243775079.0, + "step": 9626 + }, + { + "epoch": 1.0572150230617177, + "grad_norm": 2.2517623901367188, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7123011350631714, + "num_tokens": 243801343.0, + "step": 9627 + }, + { + "epoch": 1.0573248407643312, + "grad_norm": 2.453779458999634, + "learning_rate": 1e-06, + "loss": 0.814, + "mean_token_accuracy": 0.7379760146141052, + "num_tokens": 243823908.0, + "step": 9628 + }, + { + "epoch": 1.057434658466945, + "grad_norm": 2.45992374420166, + "learning_rate": 1e-06, + "loss": 0.849, + "mean_token_accuracy": 0.7425811886787415, + "num_tokens": 243845371.0, + "step": 9629 + }, + { + "epoch": 1.0575444761695585, + "grad_norm": 2.066030740737915, + "learning_rate": 1e-06, + "loss": 0.8226, + "mean_token_accuracy": 0.7416359186172485, + "num_tokens": 243872273.0, + "step": 9630 + }, + { + "epoch": 1.0576542938721722, + "grad_norm": 2.5770936012268066, + "learning_rate": 1e-06, + "loss": 0.8269, + "mean_token_accuracy": 0.73548424243927, + "num_tokens": 243890932.0, + "step": 9631 + }, + { + "epoch": 1.0577641115747858, + "grad_norm": 2.27028489112854, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7096947431564331, + "num_tokens": 243915587.0, + "step": 9632 + }, + { + "epoch": 1.0578739292773995, + "grad_norm": 2.4361276626586914, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 
0.708625316619873, + "num_tokens": 243938888.0, + "step": 9633 + }, + { + "epoch": 1.057983746980013, + "grad_norm": 2.2432610988616943, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.6991441249847412, + "num_tokens": 243965909.0, + "step": 9634 + }, + { + "epoch": 1.0580935646826268, + "grad_norm": 2.336516857147217, + "learning_rate": 1e-06, + "loss": 0.8082, + "mean_token_accuracy": 0.7501140832901001, + "num_tokens": 243988001.0, + "step": 9635 + }, + { + "epoch": 1.0582033823852406, + "grad_norm": 2.314743757247925, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.6986105442047119, + "num_tokens": 244012442.0, + "step": 9636 + }, + { + "epoch": 1.0583132000878541, + "grad_norm": 2.1493709087371826, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7118008136749268, + "num_tokens": 244039016.0, + "step": 9637 + }, + { + "epoch": 1.0584230177904679, + "grad_norm": 2.275397300720215, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7243024110794067, + "num_tokens": 244062289.0, + "step": 9638 + }, + { + "epoch": 1.0585328354930814, + "grad_norm": 2.326638698577881, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7110945582389832, + "num_tokens": 244087511.0, + "step": 9639 + }, + { + "epoch": 1.0586426531956952, + "grad_norm": 2.015742063522339, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.7059274315834045, + "num_tokens": 244118731.0, + "step": 9640 + }, + { + "epoch": 1.0587524708983087, + "grad_norm": 2.409648895263672, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7395175695419312, + "num_tokens": 244140697.0, + "step": 9641 + }, + { + "epoch": 1.0588622886009225, + "grad_norm": 2.254984140396118, + "learning_rate": 1e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.723631739616394, + "num_tokens": 244166117.0, + "step": 9642 + }, + { + "epoch": 1.0589721063035362, + "grad_norm": 2.4746148586273193, 
+ "learning_rate": 1e-06, + "loss": 0.8367, + "mean_token_accuracy": 0.7415978908538818, + "num_tokens": 244188708.0, + "step": 9643 + }, + { + "epoch": 1.0590819240061498, + "grad_norm": 2.63031268119812, + "learning_rate": 1e-06, + "loss": 0.8375, + "mean_token_accuracy": 0.7363269925117493, + "num_tokens": 244209123.0, + "step": 9644 + }, + { + "epoch": 1.0591917417087635, + "grad_norm": 2.2274062633514404, + "learning_rate": 1e-06, + "loss": 0.8558, + "mean_token_accuracy": 0.7296945452690125, + "num_tokens": 244235283.0, + "step": 9645 + }, + { + "epoch": 1.059301559411377, + "grad_norm": 2.2905099391937256, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7008475661277771, + "num_tokens": 244261768.0, + "step": 9646 + }, + { + "epoch": 1.0594113771139908, + "grad_norm": 2.3157434463500977, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7207363843917847, + "num_tokens": 244286247.0, + "step": 9647 + }, + { + "epoch": 1.0595211948166043, + "grad_norm": 2.1755852699279785, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7254062294960022, + "num_tokens": 244311766.0, + "step": 9648 + }, + { + "epoch": 1.059631012519218, + "grad_norm": 2.177393913269043, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.6957602500915527, + "num_tokens": 244340256.0, + "step": 9649 + }, + { + "epoch": 1.0597408302218319, + "grad_norm": 2.5723724365234375, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7077297568321228, + "num_tokens": 244360594.0, + "step": 9650 + }, + { + "epoch": 1.0598506479244454, + "grad_norm": 2.0511064529418945, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.698551595211029, + "num_tokens": 244392539.0, + "step": 9651 + }, + { + "epoch": 1.0599604656270591, + "grad_norm": 2.288501262664795, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7207081317901611, + "num_tokens": 244418450.0, + "step": 9652 + 
}, + { + "epoch": 1.0600702833296727, + "grad_norm": 1.939080834388733, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.6996638774871826, + "num_tokens": 244451211.0, + "step": 9653 + }, + { + "epoch": 1.0601801010322864, + "grad_norm": 2.3569681644439697, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7150843143463135, + "num_tokens": 244474626.0, + "step": 9654 + }, + { + "epoch": 1.0602899187349, + "grad_norm": 2.4101040363311768, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7100794911384583, + "num_tokens": 244497709.0, + "step": 9655 + }, + { + "epoch": 1.0603997364375137, + "grad_norm": 2.4698402881622314, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7125085592269897, + "num_tokens": 244521297.0, + "step": 9656 + }, + { + "epoch": 1.0605095541401275, + "grad_norm": 2.1432554721832275, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.719881534576416, + "num_tokens": 244548749.0, + "step": 9657 + }, + { + "epoch": 1.060619371842741, + "grad_norm": 2.114166259765625, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7105441093444824, + "num_tokens": 244575752.0, + "step": 9658 + }, + { + "epoch": 1.0607291895453548, + "grad_norm": 2.713038921356201, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7258991003036499, + "num_tokens": 244594988.0, + "step": 9659 + }, + { + "epoch": 1.0608390072479683, + "grad_norm": 2.190037488937378, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7034067511558533, + "num_tokens": 244623560.0, + "step": 9660 + }, + { + "epoch": 1.060948824950582, + "grad_norm": 2.47277569770813, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7283715605735779, + "num_tokens": 244644323.0, + "step": 9661 + }, + { + "epoch": 1.0610586426531956, + "grad_norm": 2.059171199798584, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 
0.699694037437439, + "num_tokens": 244674230.0, + "step": 9662 + }, + { + "epoch": 1.0611684603558094, + "grad_norm": 2.224076509475708, + "learning_rate": 1e-06, + "loss": 0.8511, + "mean_token_accuracy": 0.732255220413208, + "num_tokens": 244697288.0, + "step": 9663 + }, + { + "epoch": 1.0612782780584231, + "grad_norm": 2.2497408390045166, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7288048267364502, + "num_tokens": 244721681.0, + "step": 9664 + }, + { + "epoch": 1.0613880957610367, + "grad_norm": 2.006502389907837, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7101528644561768, + "num_tokens": 244753553.0, + "step": 9665 + }, + { + "epoch": 1.0614979134636504, + "grad_norm": 2.473863363265991, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7147490978240967, + "num_tokens": 244777067.0, + "step": 9666 + }, + { + "epoch": 1.061607731166264, + "grad_norm": 2.142820358276367, + "learning_rate": 1e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.7329702377319336, + "num_tokens": 244804345.0, + "step": 9667 + }, + { + "epoch": 1.0617175488688777, + "grad_norm": 2.2053933143615723, + "learning_rate": 1e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.71033775806427, + "num_tokens": 244831229.0, + "step": 9668 + }, + { + "epoch": 1.0618273665714912, + "grad_norm": 2.301964044570923, + "learning_rate": 1e-06, + "loss": 0.863, + "mean_token_accuracy": 0.7254809737205505, + "num_tokens": 244853927.0, + "step": 9669 + }, + { + "epoch": 1.061937184274105, + "grad_norm": 2.3471930027008057, + "learning_rate": 1e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.7214314937591553, + "num_tokens": 244876910.0, + "step": 9670 + }, + { + "epoch": 1.0620470019767188, + "grad_norm": 2.2772326469421387, + "learning_rate": 1e-06, + "loss": 0.8108, + "mean_token_accuracy": 0.7395801544189453, + "num_tokens": 244899500.0, + "step": 9671 + }, + { + "epoch": 1.0621568196793323, + "grad_norm": 2.2180888652801514, + 
"learning_rate": 1e-06, + "loss": 0.8587, + "mean_token_accuracy": 0.7304897308349609, + "num_tokens": 244925900.0, + "step": 9672 + }, + { + "epoch": 1.062266637381946, + "grad_norm": 2.0442147254943848, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7170448303222656, + "num_tokens": 244954076.0, + "step": 9673 + }, + { + "epoch": 1.0623764550845596, + "grad_norm": 2.2146308422088623, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.6929588317871094, + "num_tokens": 244984875.0, + "step": 9674 + }, + { + "epoch": 1.0624862727871733, + "grad_norm": 2.619624376296997, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7190617322921753, + "num_tokens": 245006173.0, + "step": 9675 + }, + { + "epoch": 1.0625960904897869, + "grad_norm": 2.5213851928710938, + "learning_rate": 1e-06, + "loss": 0.8291, + "mean_token_accuracy": 0.7339317798614502, + "num_tokens": 245027672.0, + "step": 9676 + }, + { + "epoch": 1.0627059081924006, + "grad_norm": 2.3523917198181152, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7124422192573547, + "num_tokens": 245051538.0, + "step": 9677 + }, + { + "epoch": 1.0628157258950144, + "grad_norm": 2.235255241394043, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7095957398414612, + "num_tokens": 245076733.0, + "step": 9678 + }, + { + "epoch": 1.062925543597628, + "grad_norm": 1.8914377689361572, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7134417295455933, + "num_tokens": 245111609.0, + "step": 9679 + }, + { + "epoch": 1.0630353613002417, + "grad_norm": 2.548029661178589, + "learning_rate": 1e-06, + "loss": 0.8333, + "mean_token_accuracy": 0.7379840016365051, + "num_tokens": 245133045.0, + "step": 9680 + }, + { + "epoch": 1.0631451790028552, + "grad_norm": 2.5273685455322266, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7174447178840637, + "num_tokens": 245155280.0, + "step": 9681 + 
}, + { + "epoch": 1.063254996705469, + "grad_norm": 2.274420738220215, + "learning_rate": 1e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.6880732774734497, + "num_tokens": 245182915.0, + "step": 9682 + }, + { + "epoch": 1.0633648144080825, + "grad_norm": 2.152688980102539, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7083250284194946, + "num_tokens": 245212424.0, + "step": 9683 + }, + { + "epoch": 1.0634746321106963, + "grad_norm": 2.075496196746826, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7019826769828796, + "num_tokens": 245240966.0, + "step": 9684 + }, + { + "epoch": 1.0635844498133098, + "grad_norm": 2.343944549560547, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7186077237129211, + "num_tokens": 245265154.0, + "step": 9685 + }, + { + "epoch": 1.0636942675159236, + "grad_norm": 2.6948535442352295, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7272002696990967, + "num_tokens": 245285246.0, + "step": 9686 + }, + { + "epoch": 1.0638040852185373, + "grad_norm": 2.163536310195923, + "learning_rate": 1e-06, + "loss": 1.0339, + "mean_token_accuracy": 0.6834325790405273, + "num_tokens": 245313503.0, + "step": 9687 + }, + { + "epoch": 1.0639139029211508, + "grad_norm": 2.367644786834717, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7225857973098755, + "num_tokens": 245338139.0, + "step": 9688 + }, + { + "epoch": 1.0640237206237646, + "grad_norm": 2.095052719116211, + "learning_rate": 1e-06, + "loss": 0.8434, + "mean_token_accuracy": 0.7346956729888916, + "num_tokens": 245367165.0, + "step": 9689 + }, + { + "epoch": 1.0641335383263781, + "grad_norm": 2.573234796524048, + "learning_rate": 1e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.7362085580825806, + "num_tokens": 245386880.0, + "step": 9690 + }, + { + "epoch": 1.064243356028992, + "grad_norm": 2.336196184158325, + "learning_rate": 1e-06, + "loss": 0.8317, + 
"mean_token_accuracy": 0.732316792011261, + "num_tokens": 245408477.0, + "step": 9691 + }, + { + "epoch": 1.0643531737316057, + "grad_norm": 2.5123133659362793, + "learning_rate": 1e-06, + "loss": 0.8313, + "mean_token_accuracy": 0.7317503690719604, + "num_tokens": 245428550.0, + "step": 9692 + }, + { + "epoch": 1.0644629914342192, + "grad_norm": 2.454012393951416, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7045789361000061, + "num_tokens": 245451612.0, + "step": 9693 + }, + { + "epoch": 1.064572809136833, + "grad_norm": 2.1419787406921387, + "learning_rate": 1e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7207030057907104, + "num_tokens": 245480767.0, + "step": 9694 + }, + { + "epoch": 1.0646826268394465, + "grad_norm": 2.651624917984009, + "learning_rate": 1e-06, + "loss": 0.8071, + "mean_token_accuracy": 0.7448755502700806, + "num_tokens": 245498882.0, + "step": 9695 + }, + { + "epoch": 1.0647924445420602, + "grad_norm": 2.4890613555908203, + "learning_rate": 1e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7281209230422974, + "num_tokens": 245518997.0, + "step": 9696 + }, + { + "epoch": 1.0649022622446738, + "grad_norm": 2.496299982070923, + "learning_rate": 1e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7235839366912842, + "num_tokens": 245544461.0, + "step": 9697 + }, + { + "epoch": 1.0650120799472875, + "grad_norm": 2.677661418914795, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7175291776657104, + "num_tokens": 245563887.0, + "step": 9698 + }, + { + "epoch": 1.065121897649901, + "grad_norm": 2.5958447456359863, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7169051170349121, + "num_tokens": 245583555.0, + "step": 9699 + }, + { + "epoch": 1.0652317153525148, + "grad_norm": 2.2054316997528076, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7316116690635681, + "num_tokens": 245609992.0, + "step": 9700 + }, + { + "epoch": 1.0653415330551286, + 
"grad_norm": 2.0886197090148926, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7015039920806885, + "num_tokens": 245639009.0, + "step": 9701 + }, + { + "epoch": 1.065451350757742, + "grad_norm": 2.0041089057922363, + "learning_rate": 1e-06, + "loss": 0.8225, + "mean_token_accuracy": 0.7358137369155884, + "num_tokens": 245668984.0, + "step": 9702 + }, + { + "epoch": 1.0655611684603559, + "grad_norm": 2.103055477142334, + "learning_rate": 1e-06, + "loss": 0.7854, + "mean_token_accuracy": 0.7455844879150391, + "num_tokens": 245694739.0, + "step": 9703 + }, + { + "epoch": 1.0656709861629694, + "grad_norm": 1.9707143306732178, + "learning_rate": 1e-06, + "loss": 0.8318, + "mean_token_accuracy": 0.7341393232345581, + "num_tokens": 245725273.0, + "step": 9704 + }, + { + "epoch": 1.0657808038655832, + "grad_norm": 2.562441349029541, + "learning_rate": 1e-06, + "loss": 0.8151, + "mean_token_accuracy": 0.7347482442855835, + "num_tokens": 245745242.0, + "step": 9705 + }, + { + "epoch": 1.0658906215681967, + "grad_norm": 2.1396682262420654, + "learning_rate": 1e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.7402398586273193, + "num_tokens": 245772192.0, + "step": 9706 + }, + { + "epoch": 1.0660004392708105, + "grad_norm": 2.726304769515991, + "learning_rate": 1e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7357190847396851, + "num_tokens": 245790197.0, + "step": 9707 + }, + { + "epoch": 1.0661102569734242, + "grad_norm": 1.8973013162612915, + "learning_rate": 1e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.6886294484138489, + "num_tokens": 245827885.0, + "step": 9708 + }, + { + "epoch": 1.0662200746760377, + "grad_norm": 1.996599793434143, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7178533673286438, + "num_tokens": 245859162.0, + "step": 9709 + }, + { + "epoch": 1.0663298923786515, + "grad_norm": 2.390660047531128, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.716309666633606, + 
"num_tokens": 245881288.0, + "step": 9710 + }, + { + "epoch": 1.066439710081265, + "grad_norm": 2.581974983215332, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7138036489486694, + "num_tokens": 245903124.0, + "step": 9711 + }, + { + "epoch": 1.0665495277838788, + "grad_norm": 2.0549609661102295, + "learning_rate": 1e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7296964526176453, + "num_tokens": 245929850.0, + "step": 9712 + }, + { + "epoch": 1.0666593454864923, + "grad_norm": 2.22894287109375, + "learning_rate": 1e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.72822105884552, + "num_tokens": 245954432.0, + "step": 9713 + }, + { + "epoch": 1.066769163189106, + "grad_norm": 2.4872615337371826, + "learning_rate": 1e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.7212158441543579, + "num_tokens": 245975907.0, + "step": 9714 + }, + { + "epoch": 1.0668789808917198, + "grad_norm": 2.474536180496216, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7021300792694092, + "num_tokens": 245999566.0, + "step": 9715 + }, + { + "epoch": 1.0669887985943334, + "grad_norm": 2.0407540798187256, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7125095129013062, + "num_tokens": 246029106.0, + "step": 9716 + }, + { + "epoch": 1.0670986162969471, + "grad_norm": 2.3192763328552246, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7143951654434204, + "num_tokens": 246053991.0, + "step": 9717 + }, + { + "epoch": 1.0672084339995607, + "grad_norm": 1.9927420616149902, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.707253098487854, + "num_tokens": 246084020.0, + "step": 9718 + }, + { + "epoch": 1.0673182517021744, + "grad_norm": 2.40405535697937, + "learning_rate": 1e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.7255609035491943, + "num_tokens": 246105673.0, + "step": 9719 + }, + { + "epoch": 1.067428069404788, + "grad_norm": 2.2267377376556396, + "learning_rate": 
1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.704852819442749, + "num_tokens": 246130662.0, + "step": 9720 + }, + { + "epoch": 1.0675378871074017, + "grad_norm": 2.3223159313201904, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7178411483764648, + "num_tokens": 246154677.0, + "step": 9721 + }, + { + "epoch": 1.0676477048100155, + "grad_norm": 2.5224485397338867, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7156035900115967, + "num_tokens": 246177308.0, + "step": 9722 + }, + { + "epoch": 1.067757522512629, + "grad_norm": 2.483889102935791, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.707895040512085, + "num_tokens": 246200152.0, + "step": 9723 + }, + { + "epoch": 1.0678673402152428, + "grad_norm": 2.345350742340088, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7119905948638916, + "num_tokens": 246224124.0, + "step": 9724 + }, + { + "epoch": 1.0679771579178563, + "grad_norm": 2.000906229019165, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7032807469367981, + "num_tokens": 246253044.0, + "step": 9725 + }, + { + "epoch": 1.06808697562047, + "grad_norm": 2.3719875812530518, + "learning_rate": 1e-06, + "loss": 0.8501, + "mean_token_accuracy": 0.7288657426834106, + "num_tokens": 246276401.0, + "step": 9726 + }, + { + "epoch": 1.0681967933230836, + "grad_norm": 2.2255208492279053, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.71631920337677, + "num_tokens": 246303589.0, + "step": 9727 + }, + { + "epoch": 1.0683066110256974, + "grad_norm": 2.2070794105529785, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7212094664573669, + "num_tokens": 246330067.0, + "step": 9728 + }, + { + "epoch": 1.068416428728311, + "grad_norm": 2.1230533123016357, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7045792937278748, + "num_tokens": 246357872.0, + "step": 9729 + }, + { + "epoch": 
1.0685262464309246, + "grad_norm": 2.384610414505005, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7301379442214966, + "num_tokens": 246380021.0, + "step": 9730 + }, + { + "epoch": 1.0686360641335384, + "grad_norm": 2.2242777347564697, + "learning_rate": 1e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7296016216278076, + "num_tokens": 246403913.0, + "step": 9731 + }, + { + "epoch": 1.068745881836152, + "grad_norm": 2.461785078048706, + "learning_rate": 1e-06, + "loss": 0.8834, + "mean_token_accuracy": 0.7197306752204895, + "num_tokens": 246426617.0, + "step": 9732 + }, + { + "epoch": 1.0688556995387657, + "grad_norm": 2.34486985206604, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7006558775901794, + "num_tokens": 246452518.0, + "step": 9733 + }, + { + "epoch": 1.0689655172413792, + "grad_norm": 2.334993362426758, + "learning_rate": 1e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.7275933623313904, + "num_tokens": 246476159.0, + "step": 9734 + }, + { + "epoch": 1.069075334943993, + "grad_norm": 2.3588099479675293, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7241787910461426, + "num_tokens": 246500642.0, + "step": 9735 + }, + { + "epoch": 1.0691851526466065, + "grad_norm": 2.3679919242858887, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.703815221786499, + "num_tokens": 246524503.0, + "step": 9736 + }, + { + "epoch": 1.0692949703492203, + "grad_norm": 2.1317436695098877, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7066640853881836, + "num_tokens": 246551593.0, + "step": 9737 + }, + { + "epoch": 1.069404788051834, + "grad_norm": 2.639943838119507, + "learning_rate": 1e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7238438129425049, + "num_tokens": 246570450.0, + "step": 9738 + }, + { + "epoch": 1.0695146057544476, + "grad_norm": 2.5225329399108887, + "learning_rate": 1e-06, + "loss": 0.8331, + "mean_token_accuracy": 
0.7395113110542297, + "num_tokens": 246591086.0, + "step": 9739 + }, + { + "epoch": 1.0696244234570613, + "grad_norm": 2.4514319896698, + "learning_rate": 1e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.728915810585022, + "num_tokens": 246615535.0, + "step": 9740 + }, + { + "epoch": 1.0697342411596749, + "grad_norm": 2.3239827156066895, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7197881937026978, + "num_tokens": 246641827.0, + "step": 9741 + }, + { + "epoch": 1.0698440588622886, + "grad_norm": 2.1632909774780273, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7179214954376221, + "num_tokens": 246669938.0, + "step": 9742 + }, + { + "epoch": 1.0699538765649024, + "grad_norm": 2.0771777629852295, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.7228779196739197, + "num_tokens": 246698246.0, + "step": 9743 + }, + { + "epoch": 1.070063694267516, + "grad_norm": 1.9928053617477417, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7078403830528259, + "num_tokens": 246727865.0, + "step": 9744 + }, + { + "epoch": 1.0701735119701297, + "grad_norm": 2.382657289505005, + "learning_rate": 1e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7125927209854126, + "num_tokens": 246749510.0, + "step": 9745 + }, + { + "epoch": 1.0702833296727432, + "grad_norm": 2.3172965049743652, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.6979836821556091, + "num_tokens": 246774986.0, + "step": 9746 + }, + { + "epoch": 1.070393147375357, + "grad_norm": 2.303925037384033, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7213665843009949, + "num_tokens": 246798546.0, + "step": 9747 + }, + { + "epoch": 1.0705029650779705, + "grad_norm": 2.1830825805664062, + "learning_rate": 1e-06, + "loss": 0.8532, + "mean_token_accuracy": 0.7343078851699829, + "num_tokens": 246823494.0, + "step": 9748 + }, + { + "epoch": 1.0706127827805842, + "grad_norm": 
2.3749847412109375, + "learning_rate": 1e-06, + "loss": 0.8378, + "mean_token_accuracy": 0.7360869646072388, + "num_tokens": 246846343.0, + "step": 9749 + }, + { + "epoch": 1.0707226004831978, + "grad_norm": 2.4032175540924072, + "learning_rate": 1e-06, + "loss": 0.8187, + "mean_token_accuracy": 0.7395116090774536, + "num_tokens": 246866451.0, + "step": 9750 + }, + { + "epoch": 1.0708324181858115, + "grad_norm": 2.3618483543395996, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7253016233444214, + "num_tokens": 246893903.0, + "step": 9751 + }, + { + "epoch": 1.0709422358884253, + "grad_norm": 2.7077293395996094, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7054969668388367, + "num_tokens": 246912748.0, + "step": 9752 + }, + { + "epoch": 1.0710520535910388, + "grad_norm": 1.9008175134658813, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7073885202407837, + "num_tokens": 246949213.0, + "step": 9753 + }, + { + "epoch": 1.0711618712936526, + "grad_norm": 2.331925630569458, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7130215764045715, + "num_tokens": 246972088.0, + "step": 9754 + }, + { + "epoch": 1.0712716889962661, + "grad_norm": 2.26011323928833, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.704991340637207, + "num_tokens": 246997708.0, + "step": 9755 + }, + { + "epoch": 1.0713815066988799, + "grad_norm": 2.4793481826782227, + "learning_rate": 1e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7398920059204102, + "num_tokens": 247017560.0, + "step": 9756 + }, + { + "epoch": 1.0714913244014934, + "grad_norm": 2.11769700050354, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7059504985809326, + "num_tokens": 247047623.0, + "step": 9757 + }, + { + "epoch": 1.0716011421041072, + "grad_norm": 2.5212721824645996, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7275484800338745, + "num_tokens": 
247067236.0, + "step": 9758 + }, + { + "epoch": 1.071710959806721, + "grad_norm": 2.3057093620300293, + "learning_rate": 1e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7245392203330994, + "num_tokens": 247090075.0, + "step": 9759 + }, + { + "epoch": 1.0718207775093345, + "grad_norm": 1.9761449098587036, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7128841280937195, + "num_tokens": 247119577.0, + "step": 9760 + }, + { + "epoch": 1.0719305952119482, + "grad_norm": 2.2886903285980225, + "learning_rate": 1e-06, + "loss": 0.8286, + "mean_token_accuracy": 0.7400200366973877, + "num_tokens": 247142375.0, + "step": 9761 + }, + { + "epoch": 1.0720404129145618, + "grad_norm": 2.474687337875366, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7204536199569702, + "num_tokens": 247165691.0, + "step": 9762 + }, + { + "epoch": 1.0721502306171755, + "grad_norm": 2.197514533996582, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.701940655708313, + "num_tokens": 247191648.0, + "step": 9763 + }, + { + "epoch": 1.072260048319789, + "grad_norm": 2.4421565532684326, + "learning_rate": 1e-06, + "loss": 0.8204, + "mean_token_accuracy": 0.7386313676834106, + "num_tokens": 247211620.0, + "step": 9764 + }, + { + "epoch": 1.0723698660224028, + "grad_norm": 2.5636231899261475, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7224279642105103, + "num_tokens": 247231276.0, + "step": 9765 + }, + { + "epoch": 1.0724796837250166, + "grad_norm": 2.0914840698242188, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7146987915039062, + "num_tokens": 247260755.0, + "step": 9766 + }, + { + "epoch": 1.07258950142763, + "grad_norm": 2.207505226135254, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7021432518959045, + "num_tokens": 247286599.0, + "step": 9767 + }, + { + "epoch": 1.0726993191302439, + "grad_norm": 2.1265788078308105, + "learning_rate": 1e-06, + 
"loss": 0.9518, + "mean_token_accuracy": 0.7022549510002136, + "num_tokens": 247314642.0, + "step": 9768 + }, + { + "epoch": 1.0728091368328574, + "grad_norm": 2.242382526397705, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.7022821307182312, + "num_tokens": 247341904.0, + "step": 9769 + }, + { + "epoch": 1.0729189545354711, + "grad_norm": 2.3779091835021973, + "learning_rate": 1e-06, + "loss": 0.8238, + "mean_token_accuracy": 0.7469266653060913, + "num_tokens": 247362160.0, + "step": 9770 + }, + { + "epoch": 1.0730287722380847, + "grad_norm": 2.0690114498138428, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7133904695510864, + "num_tokens": 247391041.0, + "step": 9771 + }, + { + "epoch": 1.0731385899406984, + "grad_norm": 2.532562017440796, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.6981487274169922, + "num_tokens": 247412829.0, + "step": 9772 + }, + { + "epoch": 1.0732484076433122, + "grad_norm": 2.431785821914673, + "learning_rate": 1e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.7356035113334656, + "num_tokens": 247437776.0, + "step": 9773 + }, + { + "epoch": 1.0733582253459257, + "grad_norm": 2.2923779487609863, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7190236449241638, + "num_tokens": 247462180.0, + "step": 9774 + }, + { + "epoch": 1.0734680430485395, + "grad_norm": 2.213252067565918, + "learning_rate": 1e-06, + "loss": 0.8563, + "mean_token_accuracy": 0.7296212911605835, + "num_tokens": 247486368.0, + "step": 9775 + }, + { + "epoch": 1.073577860751153, + "grad_norm": 2.267086982727051, + "learning_rate": 1e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7194921970367432, + "num_tokens": 247511291.0, + "step": 9776 + }, + { + "epoch": 1.0736876784537668, + "grad_norm": 2.291776418685913, + "learning_rate": 1e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7386510968208313, + "num_tokens": 247535058.0, + "step": 9777 + }, + { + "epoch": 
1.0737974961563803, + "grad_norm": 2.1904678344726562, + "learning_rate": 1e-06, + "loss": 0.8019, + "mean_token_accuracy": 0.7481321096420288, + "num_tokens": 247560521.0, + "step": 9778 + }, + { + "epoch": 1.073907313858994, + "grad_norm": 2.6805896759033203, + "learning_rate": 1e-06, + "loss": 0.7707, + "mean_token_accuracy": 0.7511378526687622, + "num_tokens": 247579105.0, + "step": 9779 + }, + { + "epoch": 1.0740171315616078, + "grad_norm": 1.9275457859039307, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.716014564037323, + "num_tokens": 247612113.0, + "step": 9780 + }, + { + "epoch": 1.0741269492642214, + "grad_norm": 2.0363409519195557, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7206192016601562, + "num_tokens": 247641554.0, + "step": 9781 + }, + { + "epoch": 1.0742367669668351, + "grad_norm": 2.11472487449646, + "learning_rate": 1e-06, + "loss": 0.8496, + "mean_token_accuracy": 0.7284348011016846, + "num_tokens": 247668987.0, + "step": 9782 + }, + { + "epoch": 1.0743465846694487, + "grad_norm": 2.552020788192749, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7233747839927673, + "num_tokens": 247688577.0, + "step": 9783 + }, + { + "epoch": 1.0744564023720624, + "grad_norm": 2.2125751972198486, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.6883101463317871, + "num_tokens": 247718065.0, + "step": 9784 + }, + { + "epoch": 1.074566220074676, + "grad_norm": 2.109147310256958, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7205498218536377, + "num_tokens": 247744849.0, + "step": 9785 + }, + { + "epoch": 1.0746760377772897, + "grad_norm": 2.571350336074829, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7073173522949219, + "num_tokens": 247765722.0, + "step": 9786 + }, + { + "epoch": 1.0747858554799035, + "grad_norm": 2.251467227935791, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 
0.7159534692764282, + "num_tokens": 247791934.0, + "step": 9787 + }, + { + "epoch": 1.074895673182517, + "grad_norm": 2.3549835681915283, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7055599689483643, + "num_tokens": 247818452.0, + "step": 9788 + }, + { + "epoch": 1.0750054908851308, + "grad_norm": 2.1781246662139893, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.70916748046875, + "num_tokens": 247846312.0, + "step": 9789 + }, + { + "epoch": 1.0751153085877443, + "grad_norm": 2.3828377723693848, + "learning_rate": 1e-06, + "loss": 0.908, + "mean_token_accuracy": 0.721647322177887, + "num_tokens": 247869326.0, + "step": 9790 + }, + { + "epoch": 1.075225126290358, + "grad_norm": 2.0418407917022705, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7110204696655273, + "num_tokens": 247900884.0, + "step": 9791 + }, + { + "epoch": 1.0753349439929716, + "grad_norm": 2.1344828605651855, + "learning_rate": 1e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7091770172119141, + "num_tokens": 247928539.0, + "step": 9792 + }, + { + "epoch": 1.0754447616955853, + "grad_norm": 2.023206949234009, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.713280439376831, + "num_tokens": 247958046.0, + "step": 9793 + }, + { + "epoch": 1.075554579398199, + "grad_norm": 2.138580799102783, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7099120616912842, + "num_tokens": 247984088.0, + "step": 9794 + }, + { + "epoch": 1.0756643971008126, + "grad_norm": 2.3636460304260254, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7088270783424377, + "num_tokens": 248007825.0, + "step": 9795 + }, + { + "epoch": 1.0757742148034264, + "grad_norm": 2.330860137939453, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.70644211769104, + "num_tokens": 248033794.0, + "step": 9796 + }, + { + "epoch": 1.07588403250604, + "grad_norm": 2.474649667739868, + 
"learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7182175517082214, + "num_tokens": 248055477.0, + "step": 9797 + }, + { + "epoch": 1.0759938502086537, + "grad_norm": 2.135058641433716, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7318428158760071, + "num_tokens": 248083568.0, + "step": 9798 + }, + { + "epoch": 1.0761036679112672, + "grad_norm": 2.348930835723877, + "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.6968196630477905, + "num_tokens": 248110473.0, + "step": 9799 + }, + { + "epoch": 1.076213485613881, + "grad_norm": 2.6001241207122803, + "learning_rate": 1e-06, + "loss": 0.8203, + "mean_token_accuracy": 0.7375259399414062, + "num_tokens": 248128967.0, + "step": 9800 + }, + { + "epoch": 1.0763233033164945, + "grad_norm": 2.3687775135040283, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.709439218044281, + "num_tokens": 248152534.0, + "step": 9801 + }, + { + "epoch": 1.0764331210191083, + "grad_norm": 2.227548837661743, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7232202291488647, + "num_tokens": 248178495.0, + "step": 9802 + }, + { + "epoch": 1.076542938721722, + "grad_norm": 2.5338594913482666, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.737442135810852, + "num_tokens": 248199070.0, + "step": 9803 + }, + { + "epoch": 1.0766527564243356, + "grad_norm": 2.1310627460479736, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7108855247497559, + "num_tokens": 248228110.0, + "step": 9804 + }, + { + "epoch": 1.0767625741269493, + "grad_norm": 2.553022861480713, + "learning_rate": 1e-06, + "loss": 0.7943, + "mean_token_accuracy": 0.7472679615020752, + "num_tokens": 248247744.0, + "step": 9805 + }, + { + "epoch": 1.0768723918295628, + "grad_norm": 2.4344465732574463, + "learning_rate": 1e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.7217670679092407, + "num_tokens": 248271303.0, + "step": 9806 + }, 
+ { + "epoch": 1.0769822095321766, + "grad_norm": 2.6908154487609863, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.7185606360435486, + "num_tokens": 248291244.0, + "step": 9807 + }, + { + "epoch": 1.0770920272347904, + "grad_norm": 2.066070079803467, + "learning_rate": 1e-06, + "loss": 0.8201, + "mean_token_accuracy": 0.7442659139633179, + "num_tokens": 248318626.0, + "step": 9808 + }, + { + "epoch": 1.077201844937404, + "grad_norm": 1.9691786766052246, + "learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7254205942153931, + "num_tokens": 248350064.0, + "step": 9809 + }, + { + "epoch": 1.0773116626400177, + "grad_norm": 1.9827520847320557, + "learning_rate": 1e-06, + "loss": 0.8322, + "mean_token_accuracy": 0.7382688522338867, + "num_tokens": 248378610.0, + "step": 9810 + }, + { + "epoch": 1.0774214803426312, + "grad_norm": 1.8423973321914673, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.722033679485321, + "num_tokens": 248412680.0, + "step": 9811 + }, + { + "epoch": 1.077531298045245, + "grad_norm": 2.121858596801758, + "learning_rate": 1e-06, + "loss": 0.8418, + "mean_token_accuracy": 0.7298845052719116, + "num_tokens": 248438651.0, + "step": 9812 + }, + { + "epoch": 1.0776411157478585, + "grad_norm": 2.1495654582977295, + "learning_rate": 1e-06, + "loss": 0.8566, + "mean_token_accuracy": 0.7371784448623657, + "num_tokens": 248465836.0, + "step": 9813 + }, + { + "epoch": 1.0777509334504722, + "grad_norm": 2.1813712120056152, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7136545181274414, + "num_tokens": 248491902.0, + "step": 9814 + }, + { + "epoch": 1.0778607511530858, + "grad_norm": 2.141939401626587, + "learning_rate": 1e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7272363901138306, + "num_tokens": 248517471.0, + "step": 9815 + }, + { + "epoch": 1.0779705688556995, + "grad_norm": 2.2509422302246094, + "learning_rate": 1e-06, + "loss": 0.9294, + 
"mean_token_accuracy": 0.7165226936340332, + "num_tokens": 248543872.0, + "step": 9816 + }, + { + "epoch": 1.0780803865583133, + "grad_norm": 2.36887526512146, + "learning_rate": 1e-06, + "loss": 0.8589, + "mean_token_accuracy": 0.7289929389953613, + "num_tokens": 248567434.0, + "step": 9817 + }, + { + "epoch": 1.0781902042609268, + "grad_norm": 2.1524810791015625, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7105391621589661, + "num_tokens": 248594201.0, + "step": 9818 + }, + { + "epoch": 1.0783000219635406, + "grad_norm": 2.372572898864746, + "learning_rate": 1e-06, + "loss": 0.8366, + "mean_token_accuracy": 0.7351568341255188, + "num_tokens": 248617163.0, + "step": 9819 + }, + { + "epoch": 1.0784098396661541, + "grad_norm": 2.1420035362243652, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.715024471282959, + "num_tokens": 248643554.0, + "step": 9820 + }, + { + "epoch": 1.0785196573687679, + "grad_norm": 2.3227243423461914, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.725208044052124, + "num_tokens": 248668217.0, + "step": 9821 + }, + { + "epoch": 1.0786294750713814, + "grad_norm": 2.1845033168792725, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.6978979706764221, + "num_tokens": 248695877.0, + "step": 9822 + }, + { + "epoch": 1.0787392927739952, + "grad_norm": 2.2105443477630615, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.6923742294311523, + "num_tokens": 248724521.0, + "step": 9823 + }, + { + "epoch": 1.078849110476609, + "grad_norm": 1.8746132850646973, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.6893850564956665, + "num_tokens": 248759034.0, + "step": 9824 + }, + { + "epoch": 1.0789589281792225, + "grad_norm": 2.2656240463256836, + "learning_rate": 1e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7184303998947144, + "num_tokens": 248782514.0, + "step": 9825 + }, + { + "epoch": 1.0790687458818362, + 
"grad_norm": 2.4242708683013916, + "learning_rate": 1e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.729724109172821, + "num_tokens": 248804592.0, + "step": 9826 + }, + { + "epoch": 1.0791785635844497, + "grad_norm": 2.4743332862854004, + "learning_rate": 1e-06, + "loss": 0.8456, + "mean_token_accuracy": 0.7430020570755005, + "num_tokens": 248826619.0, + "step": 9827 + }, + { + "epoch": 1.0792883812870635, + "grad_norm": 1.9934746026992798, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7105915546417236, + "num_tokens": 248858779.0, + "step": 9828 + }, + { + "epoch": 1.079398198989677, + "grad_norm": 2.3261778354644775, + "learning_rate": 1e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.7387728095054626, + "num_tokens": 248884525.0, + "step": 9829 + }, + { + "epoch": 1.0795080166922908, + "grad_norm": 2.1061649322509766, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.6997389793395996, + "num_tokens": 248914078.0, + "step": 9830 + }, + { + "epoch": 1.0796178343949046, + "grad_norm": 2.6473143100738525, + "learning_rate": 1e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7221829295158386, + "num_tokens": 248934793.0, + "step": 9831 + }, + { + "epoch": 1.079727652097518, + "grad_norm": 2.457850694656372, + "learning_rate": 1e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.7279089689254761, + "num_tokens": 248956801.0, + "step": 9832 + }, + { + "epoch": 1.0798374698001318, + "grad_norm": 2.4142980575561523, + "learning_rate": 1e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7254618406295776, + "num_tokens": 248981816.0, + "step": 9833 + }, + { + "epoch": 1.0799472875027454, + "grad_norm": 2.130415439605713, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7182551026344299, + "num_tokens": 249009930.0, + "step": 9834 + }, + { + "epoch": 1.0800571052053591, + "grad_norm": 2.219026565551758, + "learning_rate": 1e-06, + "loss": 0.891, + "mean_token_accuracy": 0.725029468536377, + 
"num_tokens": 249037065.0, + "step": 9835 + }, + { + "epoch": 1.0801669229079727, + "grad_norm": 2.1414315700531006, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.710169792175293, + "num_tokens": 249065937.0, + "step": 9836 + }, + { + "epoch": 1.0802767406105864, + "grad_norm": 2.4166014194488525, + "learning_rate": 1e-06, + "loss": 0.8017, + "mean_token_accuracy": 0.7354777455329895, + "num_tokens": 249086164.0, + "step": 9837 + }, + { + "epoch": 1.0803865583132002, + "grad_norm": 2.470210075378418, + "learning_rate": 1e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.7345396876335144, + "num_tokens": 249107001.0, + "step": 9838 + }, + { + "epoch": 1.0804963760158137, + "grad_norm": 2.196866273880005, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.699277937412262, + "num_tokens": 249137186.0, + "step": 9839 + }, + { + "epoch": 1.0806061937184275, + "grad_norm": 2.1413865089416504, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7106915712356567, + "num_tokens": 249166168.0, + "step": 9840 + }, + { + "epoch": 1.080716011421041, + "grad_norm": 2.239595651626587, + "learning_rate": 1e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7442532181739807, + "num_tokens": 249190635.0, + "step": 9841 + }, + { + "epoch": 1.0808258291236548, + "grad_norm": 2.2863729000091553, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7307924032211304, + "num_tokens": 249216829.0, + "step": 9842 + }, + { + "epoch": 1.0809356468262683, + "grad_norm": 2.0936849117279053, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7293778657913208, + "num_tokens": 249245452.0, + "step": 9843 + }, + { + "epoch": 1.081045464528882, + "grad_norm": 2.3874497413635254, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7125000953674316, + "num_tokens": 249268898.0, + "step": 9844 + }, + { + "epoch": 1.0811552822314958, + "grad_norm": 2.2611827850341797, + "learning_rate": 
1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.72400963306427, + "num_tokens": 249294911.0, + "step": 9845 + }, + { + "epoch": 1.0812650999341094, + "grad_norm": 2.5588464736938477, + "learning_rate": 1e-06, + "loss": 0.8406, + "mean_token_accuracy": 0.7399965524673462, + "num_tokens": 249315235.0, + "step": 9846 + }, + { + "epoch": 1.081374917636723, + "grad_norm": 2.3663887977600098, + "learning_rate": 1e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.7346248030662537, + "num_tokens": 249338103.0, + "step": 9847 + }, + { + "epoch": 1.0814847353393366, + "grad_norm": 2.2978975772857666, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7171041965484619, + "num_tokens": 249363708.0, + "step": 9848 + }, + { + "epoch": 1.0815945530419504, + "grad_norm": 2.372418165206909, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7126611471176147, + "num_tokens": 249387206.0, + "step": 9849 + }, + { + "epoch": 1.081704370744564, + "grad_norm": 2.0722885131835938, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7066718935966492, + "num_tokens": 249415818.0, + "step": 9850 + }, + { + "epoch": 1.0818141884471777, + "grad_norm": 2.4804699420928955, + "learning_rate": 1e-06, + "loss": 0.8444, + "mean_token_accuracy": 0.7336956858634949, + "num_tokens": 249436801.0, + "step": 9851 + }, + { + "epoch": 1.0819240061497912, + "grad_norm": 2.0286178588867188, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7046769857406616, + "num_tokens": 249466981.0, + "step": 9852 + }, + { + "epoch": 1.082033823852405, + "grad_norm": 2.239511489868164, + "learning_rate": 1e-06, + "loss": 0.8185, + "mean_token_accuracy": 0.7442629933357239, + "num_tokens": 249494981.0, + "step": 9853 + }, + { + "epoch": 1.0821436415550187, + "grad_norm": 2.8496932983398438, + "learning_rate": 1e-06, + "loss": 0.8736, + "mean_token_accuracy": 0.7251133918762207, + "num_tokens": 249512970.0, + "step": 9854 + }, + { + "epoch": 
1.0822534592576323, + "grad_norm": 2.178034782409668, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7170886993408203, + "num_tokens": 249540149.0, + "step": 9855 + }, + { + "epoch": 1.082363276960246, + "grad_norm": 1.9298585653305054, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7122325897216797, + "num_tokens": 249572347.0, + "step": 9856 + }, + { + "epoch": 1.0824730946628596, + "grad_norm": 2.2156050205230713, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7190383076667786, + "num_tokens": 249598720.0, + "step": 9857 + }, + { + "epoch": 1.0825829123654733, + "grad_norm": 2.0537021160125732, + "learning_rate": 1e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.6905226707458496, + "num_tokens": 249631442.0, + "step": 9858 + }, + { + "epoch": 1.082692730068087, + "grad_norm": 2.459063768386841, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7133271098136902, + "num_tokens": 249652915.0, + "step": 9859 + }, + { + "epoch": 1.0828025477707006, + "grad_norm": 1.9482178688049316, + "learning_rate": 1e-06, + "loss": 0.8329, + "mean_token_accuracy": 0.7386466860771179, + "num_tokens": 249683595.0, + "step": 9860 + }, + { + "epoch": 1.0829123654733144, + "grad_norm": 2.3681070804595947, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7041612863540649, + "num_tokens": 249708556.0, + "step": 9861 + }, + { + "epoch": 1.083022183175928, + "grad_norm": 2.089236259460449, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7189779877662659, + "num_tokens": 249735674.0, + "step": 9862 + }, + { + "epoch": 1.0831320008785417, + "grad_norm": 2.434856653213501, + "learning_rate": 1e-06, + "loss": 0.8448, + "mean_token_accuracy": 0.7335052490234375, + "num_tokens": 249759338.0, + "step": 9863 + }, + { + "epoch": 1.0832418185811552, + "grad_norm": 2.321260690689087, + "learning_rate": 1e-06, + "loss": 0.8838, + "mean_token_accuracy": 
0.738493025302887, + "num_tokens": 249783764.0, + "step": 9864 + }, + { + "epoch": 1.083351636283769, + "grad_norm": 2.3607800006866455, + "learning_rate": 1e-06, + "loss": 0.8369, + "mean_token_accuracy": 0.738838791847229, + "num_tokens": 249805944.0, + "step": 9865 + }, + { + "epoch": 1.0834614539863825, + "grad_norm": 2.4968574047088623, + "learning_rate": 1e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.7576614618301392, + "num_tokens": 249825470.0, + "step": 9866 + }, + { + "epoch": 1.0835712716889963, + "grad_norm": 2.2448716163635254, + "learning_rate": 1e-06, + "loss": 0.8341, + "mean_token_accuracy": 0.734520435333252, + "num_tokens": 249849344.0, + "step": 9867 + }, + { + "epoch": 1.08368108939161, + "grad_norm": 2.538480758666992, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7274686098098755, + "num_tokens": 249869582.0, + "step": 9868 + }, + { + "epoch": 1.0837909070942235, + "grad_norm": 2.130082845687866, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7290595173835754, + "num_tokens": 249897851.0, + "step": 9869 + }, + { + "epoch": 1.0839007247968373, + "grad_norm": 2.222700834274292, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.7056578993797302, + "num_tokens": 249923144.0, + "step": 9870 + }, + { + "epoch": 1.0840105424994508, + "grad_norm": 1.989417314529419, + "learning_rate": 1e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7429752945899963, + "num_tokens": 249950395.0, + "step": 9871 + }, + { + "epoch": 1.0841203602020646, + "grad_norm": 2.594764471054077, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.7038764953613281, + "num_tokens": 249970464.0, + "step": 9872 + }, + { + "epoch": 1.0842301779046783, + "grad_norm": 2.4802401065826416, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.6987035274505615, + "num_tokens": 249993652.0, + "step": 9873 + }, + { + "epoch": 1.0843399956072919, + "grad_norm": 2.273852586746216, 
+ "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7163749933242798, + "num_tokens": 250020633.0, + "step": 9874 + }, + { + "epoch": 1.0844498133099056, + "grad_norm": 2.2237021923065186, + "learning_rate": 1e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.6989153623580933, + "num_tokens": 250046362.0, + "step": 9875 + }, + { + "epoch": 1.0845596310125192, + "grad_norm": 2.2626731395721436, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7037127017974854, + "num_tokens": 250072778.0, + "step": 9876 + }, + { + "epoch": 1.084669448715133, + "grad_norm": 2.0425078868865967, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.6994622945785522, + "num_tokens": 250104589.0, + "step": 9877 + }, + { + "epoch": 1.0847792664177465, + "grad_norm": 2.2413315773010254, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7002214193344116, + "num_tokens": 250132595.0, + "step": 9878 + }, + { + "epoch": 1.0848890841203602, + "grad_norm": 2.3446848392486572, + "learning_rate": 1e-06, + "loss": 0.8396, + "mean_token_accuracy": 0.7367686629295349, + "num_tokens": 250156836.0, + "step": 9879 + }, + { + "epoch": 1.0849989018229738, + "grad_norm": 2.0779294967651367, + "learning_rate": 1e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.7284910678863525, + "num_tokens": 250184728.0, + "step": 9880 + }, + { + "epoch": 1.0851087195255875, + "grad_norm": 2.4478180408477783, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7175232172012329, + "num_tokens": 250206784.0, + "step": 9881 + }, + { + "epoch": 1.0852185372282013, + "grad_norm": 1.8789129257202148, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7042356729507446, + "num_tokens": 250244457.0, + "step": 9882 + }, + { + "epoch": 1.0853283549308148, + "grad_norm": 2.555729389190674, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7212689518928528, + "num_tokens": 250266508.0, + "step": 
9883 + }, + { + "epoch": 1.0854381726334286, + "grad_norm": 2.4001879692077637, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7193325757980347, + "num_tokens": 250290524.0, + "step": 9884 + }, + { + "epoch": 1.085547990336042, + "grad_norm": 2.146458625793457, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7018752098083496, + "num_tokens": 250319176.0, + "step": 9885 + }, + { + "epoch": 1.0856578080386559, + "grad_norm": 2.3017489910125732, + "learning_rate": 1e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7193831205368042, + "num_tokens": 250343958.0, + "step": 9886 + }, + { + "epoch": 1.0857676257412694, + "grad_norm": 2.350637912750244, + "learning_rate": 1e-06, + "loss": 0.8274, + "mean_token_accuracy": 0.7436782121658325, + "num_tokens": 250366753.0, + "step": 9887 + }, + { + "epoch": 1.0858774434438832, + "grad_norm": 2.2656302452087402, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7101246118545532, + "num_tokens": 250391593.0, + "step": 9888 + }, + { + "epoch": 1.085987261146497, + "grad_norm": 2.5304641723632812, + "learning_rate": 1e-06, + "loss": 0.7941, + "mean_token_accuracy": 0.7516040802001953, + "num_tokens": 250412395.0, + "step": 9889 + }, + { + "epoch": 1.0860970788491104, + "grad_norm": 2.236076831817627, + "learning_rate": 1e-06, + "loss": 0.8623, + "mean_token_accuracy": 0.7275128364562988, + "num_tokens": 250437494.0, + "step": 9890 + }, + { + "epoch": 1.0862068965517242, + "grad_norm": 2.332801580429077, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7024384140968323, + "num_tokens": 250460928.0, + "step": 9891 + }, + { + "epoch": 1.0863167142543377, + "grad_norm": 2.080505609512329, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.6987891793251038, + "num_tokens": 250491359.0, + "step": 9892 + }, + { + "epoch": 1.0864265319569515, + "grad_norm": 2.1738555431365967, + "learning_rate": 1e-06, + "loss": 0.9413, + 
"mean_token_accuracy": 0.706426739692688, + "num_tokens": 250518727.0, + "step": 9893 + }, + { + "epoch": 1.086536349659565, + "grad_norm": 2.2823832035064697, + "learning_rate": 1e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.728723406791687, + "num_tokens": 250542894.0, + "step": 9894 + }, + { + "epoch": 1.0866461673621788, + "grad_norm": 3.049112558364868, + "learning_rate": 1e-06, + "loss": 0.8155, + "mean_token_accuracy": 0.7394169569015503, + "num_tokens": 250559190.0, + "step": 9895 + }, + { + "epoch": 1.0867559850647925, + "grad_norm": 2.1578359603881836, + "learning_rate": 1e-06, + "loss": 0.884, + "mean_token_accuracy": 0.7272627949714661, + "num_tokens": 250585820.0, + "step": 9896 + }, + { + "epoch": 1.086865802767406, + "grad_norm": 2.288660764694214, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7166399359703064, + "num_tokens": 250608784.0, + "step": 9897 + }, + { + "epoch": 1.0869756204700198, + "grad_norm": 2.1119778156280518, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.724952220916748, + "num_tokens": 250634411.0, + "step": 9898 + }, + { + "epoch": 1.0870854381726334, + "grad_norm": 2.037013053894043, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7070389986038208, + "num_tokens": 250665999.0, + "step": 9899 + }, + { + "epoch": 1.0871952558752471, + "grad_norm": 2.440768003463745, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7345567345619202, + "num_tokens": 250687846.0, + "step": 9900 + }, + { + "epoch": 1.0873050735778607, + "grad_norm": 2.1755716800689697, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.711503267288208, + "num_tokens": 250717011.0, + "step": 9901 + }, + { + "epoch": 1.0874148912804744, + "grad_norm": 2.290372133255005, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.720306396484375, + "num_tokens": 250742558.0, + "step": 9902 + }, + { + "epoch": 1.0875247089830882, + 
"grad_norm": 2.248826265335083, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7276605367660522, + "num_tokens": 250768388.0, + "step": 9903 + }, + { + "epoch": 1.0876345266857017, + "grad_norm": 2.120119094848633, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7177270650863647, + "num_tokens": 250797954.0, + "step": 9904 + }, + { + "epoch": 1.0877443443883155, + "grad_norm": 2.141404390335083, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7164778709411621, + "num_tokens": 250825180.0, + "step": 9905 + }, + { + "epoch": 1.087854162090929, + "grad_norm": 2.2461206912994385, + "learning_rate": 1e-06, + "loss": 0.8727, + "mean_token_accuracy": 0.7258932590484619, + "num_tokens": 250851021.0, + "step": 9906 + }, + { + "epoch": 1.0879639797935428, + "grad_norm": 2.333407163619995, + "learning_rate": 1e-06, + "loss": 0.8499, + "mean_token_accuracy": 0.7321629524230957, + "num_tokens": 250874358.0, + "step": 9907 + }, + { + "epoch": 1.0880737974961563, + "grad_norm": 1.9811617136001587, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7271469235420227, + "num_tokens": 250907091.0, + "step": 9908 + }, + { + "epoch": 1.08818361519877, + "grad_norm": 2.075878143310547, + "learning_rate": 1e-06, + "loss": 0.8424, + "mean_token_accuracy": 0.7368238568305969, + "num_tokens": 250935750.0, + "step": 9909 + }, + { + "epoch": 1.0882934329013838, + "grad_norm": 2.2333738803863525, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7051442861557007, + "num_tokens": 250962800.0, + "step": 9910 + }, + { + "epoch": 1.0884032506039973, + "grad_norm": 2.178488254547119, + "learning_rate": 1e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.7250815033912659, + "num_tokens": 250989349.0, + "step": 9911 + }, + { + "epoch": 1.088513068306611, + "grad_norm": 2.4109206199645996, + "learning_rate": 1e-06, + "loss": 0.8714, + "mean_token_accuracy": 0.7297115325927734, + "num_tokens": 
251010902.0, + "step": 9912 + }, + { + "epoch": 1.0886228860092246, + "grad_norm": 2.3533246517181396, + "learning_rate": 1e-06, + "loss": 0.8312, + "mean_token_accuracy": 0.7390998005867004, + "num_tokens": 251032783.0, + "step": 9913 + }, + { + "epoch": 1.0887327037118384, + "grad_norm": 2.109630823135376, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7175682783126831, + "num_tokens": 251062050.0, + "step": 9914 + }, + { + "epoch": 1.088842521414452, + "grad_norm": 2.209017753601074, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7147915959358215, + "num_tokens": 251088658.0, + "step": 9915 + }, + { + "epoch": 1.0889523391170657, + "grad_norm": 2.158860445022583, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7113361954689026, + "num_tokens": 251120020.0, + "step": 9916 + }, + { + "epoch": 1.0890621568196792, + "grad_norm": 2.190626382827759, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7214410901069641, + "num_tokens": 251145789.0, + "step": 9917 + }, + { + "epoch": 1.089171974522293, + "grad_norm": 2.6410202980041504, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7202934622764587, + "num_tokens": 251165093.0, + "step": 9918 + }, + { + "epoch": 1.0892817922249067, + "grad_norm": 2.341370105743408, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7043102979660034, + "num_tokens": 251191889.0, + "step": 9919 + }, + { + "epoch": 1.0893916099275203, + "grad_norm": 2.3120503425598145, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7100785970687866, + "num_tokens": 251217551.0, + "step": 9920 + }, + { + "epoch": 1.089501427630134, + "grad_norm": 2.2419307231903076, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.713767945766449, + "num_tokens": 251244193.0, + "step": 9921 + }, + { + "epoch": 1.0896112453327476, + "grad_norm": 2.0746400356292725, + "learning_rate": 1e-06, + "loss": 
0.955, + "mean_token_accuracy": 0.7101303935050964, + "num_tokens": 251274101.0, + "step": 9922 + }, + { + "epoch": 1.0897210630353613, + "grad_norm": 2.2927844524383545, + "learning_rate": 1e-06, + "loss": 0.8215, + "mean_token_accuracy": 0.7410891056060791, + "num_tokens": 251297583.0, + "step": 9923 + }, + { + "epoch": 1.089830880737975, + "grad_norm": 2.2153398990631104, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7189652919769287, + "num_tokens": 251323213.0, + "step": 9924 + }, + { + "epoch": 1.0899406984405886, + "grad_norm": 2.4458775520324707, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.710419237613678, + "num_tokens": 251346978.0, + "step": 9925 + }, + { + "epoch": 1.0900505161432024, + "grad_norm": 2.0958750247955322, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7114900946617126, + "num_tokens": 251375338.0, + "step": 9926 + }, + { + "epoch": 1.090160333845816, + "grad_norm": 2.12973690032959, + "learning_rate": 1e-06, + "loss": 1.0413, + "mean_token_accuracy": 0.677607536315918, + "num_tokens": 251406797.0, + "step": 9927 + }, + { + "epoch": 1.0902701515484297, + "grad_norm": 2.199235439300537, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.6979209184646606, + "num_tokens": 251433868.0, + "step": 9928 + }, + { + "epoch": 1.0903799692510432, + "grad_norm": 2.40022873878479, + "learning_rate": 1e-06, + "loss": 0.8399, + "mean_token_accuracy": 0.7345188856124878, + "num_tokens": 251455604.0, + "step": 9929 + }, + { + "epoch": 1.090489786953657, + "grad_norm": 2.208258628845215, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7160606384277344, + "num_tokens": 251483762.0, + "step": 9930 + }, + { + "epoch": 1.0905996046562705, + "grad_norm": 2.16169810295105, + "learning_rate": 1e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7234076261520386, + "num_tokens": 251508893.0, + "step": 9931 + }, + { + "epoch": 1.0907094223588842, + 
"grad_norm": 2.3030426502227783, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.7037116289138794, + "num_tokens": 251535857.0, + "step": 9932 + }, + { + "epoch": 1.090819240061498, + "grad_norm": 2.2166483402252197, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7093695402145386, + "num_tokens": 251564061.0, + "step": 9933 + }, + { + "epoch": 1.0909290577641115, + "grad_norm": 2.1084401607513428, + "learning_rate": 1e-06, + "loss": 0.8091, + "mean_token_accuracy": 0.743854820728302, + "num_tokens": 251589497.0, + "step": 9934 + }, + { + "epoch": 1.0910388754667253, + "grad_norm": 2.086435556411743, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.6956474781036377, + "num_tokens": 251618697.0, + "step": 9935 + }, + { + "epoch": 1.0911486931693388, + "grad_norm": 2.3376169204711914, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7053329348564148, + "num_tokens": 251642230.0, + "step": 9936 + }, + { + "epoch": 1.0912585108719526, + "grad_norm": 2.254330635070801, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7289338707923889, + "num_tokens": 251668566.0, + "step": 9937 + }, + { + "epoch": 1.0913683285745663, + "grad_norm": 2.2695302963256836, + "learning_rate": 1e-06, + "loss": 0.909, + "mean_token_accuracy": 0.719545841217041, + "num_tokens": 251693979.0, + "step": 9938 + }, + { + "epoch": 1.0914781462771799, + "grad_norm": 2.33341908454895, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7133138179779053, + "num_tokens": 251718927.0, + "step": 9939 + }, + { + "epoch": 1.0915879639797936, + "grad_norm": 2.375554084777832, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.6916115283966064, + "num_tokens": 251744073.0, + "step": 9940 + }, + { + "epoch": 1.0916977816824072, + "grad_norm": 2.5391430854797363, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.715043842792511, + "num_tokens": 
251765812.0, + "step": 9941 + }, + { + "epoch": 1.091807599385021, + "grad_norm": 2.1439120769500732, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7063207626342773, + "num_tokens": 251792758.0, + "step": 9942 + }, + { + "epoch": 1.0919174170876345, + "grad_norm": 2.274569511413574, + "learning_rate": 1e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.7209393978118896, + "num_tokens": 251817767.0, + "step": 9943 + }, + { + "epoch": 1.0920272347902482, + "grad_norm": 2.255298137664795, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.6951934695243835, + "num_tokens": 251845563.0, + "step": 9944 + }, + { + "epoch": 1.0921370524928617, + "grad_norm": 2.252375841140747, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.714750349521637, + "num_tokens": 251869567.0, + "step": 9945 + }, + { + "epoch": 1.0922468701954755, + "grad_norm": 2.2655298709869385, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.723560094833374, + "num_tokens": 251896116.0, + "step": 9946 + }, + { + "epoch": 1.0923566878980893, + "grad_norm": 2.093575954437256, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.7133086919784546, + "num_tokens": 251925283.0, + "step": 9947 + }, + { + "epoch": 1.0924665056007028, + "grad_norm": 2.2017314434051514, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7145835161209106, + "num_tokens": 251952668.0, + "step": 9948 + }, + { + "epoch": 1.0925763233033166, + "grad_norm": 2.2087128162384033, + "learning_rate": 1e-06, + "loss": 0.8647, + "mean_token_accuracy": 0.7329721450805664, + "num_tokens": 251978272.0, + "step": 9949 + }, + { + "epoch": 1.09268614100593, + "grad_norm": 2.475449562072754, + "learning_rate": 1e-06, + "loss": 0.8465, + "mean_token_accuracy": 0.7251386642456055, + "num_tokens": 251999476.0, + "step": 9950 + }, + { + "epoch": 1.0927959587085438, + "grad_norm": 2.469285249710083, + "learning_rate": 1e-06, + "loss": 
0.8812, + "mean_token_accuracy": 0.7248811721801758, + "num_tokens": 252020555.0, + "step": 9951 + }, + { + "epoch": 1.0929057764111574, + "grad_norm": 2.2710468769073486, + "learning_rate": 1e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.724797248840332, + "num_tokens": 252044320.0, + "step": 9952 + }, + { + "epoch": 1.0930155941137711, + "grad_norm": 2.4864492416381836, + "learning_rate": 1e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.721386730670929, + "num_tokens": 252066047.0, + "step": 9953 + }, + { + "epoch": 1.093125411816385, + "grad_norm": 2.469773292541504, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7107696533203125, + "num_tokens": 252089946.0, + "step": 9954 + }, + { + "epoch": 1.0932352295189984, + "grad_norm": 2.74432635307312, + "learning_rate": 1e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7421044111251831, + "num_tokens": 252107389.0, + "step": 9955 + }, + { + "epoch": 1.0933450472216122, + "grad_norm": 2.4528236389160156, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7117721438407898, + "num_tokens": 252129007.0, + "step": 9956 + }, + { + "epoch": 1.0934548649242257, + "grad_norm": 2.173532247543335, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7073737978935242, + "num_tokens": 252157642.0, + "step": 9957 + }, + { + "epoch": 1.0935646826268395, + "grad_norm": 2.1719722747802734, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7190819978713989, + "num_tokens": 252184545.0, + "step": 9958 + }, + { + "epoch": 1.093674500329453, + "grad_norm": 2.0387840270996094, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7202675342559814, + "num_tokens": 252216041.0, + "step": 9959 + }, + { + "epoch": 1.0937843180320668, + "grad_norm": 2.449307441711426, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7289942502975464, + "num_tokens": 252238485.0, + "step": 9960 + }, + { + "epoch": 1.0938941357346805, 
+ "grad_norm": 2.151638984680176, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7136145830154419, + "num_tokens": 252266943.0, + "step": 9961 + }, + { + "epoch": 1.094003953437294, + "grad_norm": 2.3650317192077637, + "learning_rate": 1e-06, + "loss": 0.8253, + "mean_token_accuracy": 0.7334977388381958, + "num_tokens": 252290387.0, + "step": 9962 + }, + { + "epoch": 1.0941137711399078, + "grad_norm": 2.372191905975342, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7322217226028442, + "num_tokens": 252313521.0, + "step": 9963 + }, + { + "epoch": 1.0942235888425214, + "grad_norm": 2.461843252182007, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7238644957542419, + "num_tokens": 252336556.0, + "step": 9964 + }, + { + "epoch": 1.0943334065451351, + "grad_norm": 2.0436670780181885, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7097086310386658, + "num_tokens": 252366023.0, + "step": 9965 + }, + { + "epoch": 1.0944432242477486, + "grad_norm": 2.520517587661743, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7152982950210571, + "num_tokens": 252387684.0, + "step": 9966 + }, + { + "epoch": 1.0945530419503624, + "grad_norm": 2.208726406097412, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.712428867816925, + "num_tokens": 252414239.0, + "step": 9967 + }, + { + "epoch": 1.0946628596529762, + "grad_norm": 2.1211390495300293, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7195781469345093, + "num_tokens": 252442271.0, + "step": 9968 + }, + { + "epoch": 1.0947726773555897, + "grad_norm": 2.325277328491211, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7256177067756653, + "num_tokens": 252466733.0, + "step": 9969 + }, + { + "epoch": 1.0948824950582035, + "grad_norm": 2.3736791610717773, + "learning_rate": 1e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.7324340343475342, + 
"num_tokens": 252487958.0, + "step": 9970 + }, + { + "epoch": 1.094992312760817, + "grad_norm": 2.3441686630249023, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7241129875183105, + "num_tokens": 252513852.0, + "step": 9971 + }, + { + "epoch": 1.0951021304634307, + "grad_norm": 2.5700583457946777, + "learning_rate": 1e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.7312885522842407, + "num_tokens": 252533872.0, + "step": 9972 + }, + { + "epoch": 1.0952119481660443, + "grad_norm": 2.342421054840088, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7243757843971252, + "num_tokens": 252557830.0, + "step": 9973 + }, + { + "epoch": 1.095321765868658, + "grad_norm": 2.26294207572937, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7152090072631836, + "num_tokens": 252583048.0, + "step": 9974 + }, + { + "epoch": 1.0954315835712718, + "grad_norm": 2.3301479816436768, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.6959453821182251, + "num_tokens": 252610746.0, + "step": 9975 + }, + { + "epoch": 1.0955414012738853, + "grad_norm": 2.1863315105438232, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7162644267082214, + "num_tokens": 252639539.0, + "step": 9976 + }, + { + "epoch": 1.095651218976499, + "grad_norm": 2.246549129486084, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7144955396652222, + "num_tokens": 252667053.0, + "step": 9977 + }, + { + "epoch": 1.0957610366791126, + "grad_norm": 2.446511745452881, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7195621728897095, + "num_tokens": 252689527.0, + "step": 9978 + }, + { + "epoch": 1.0958708543817264, + "grad_norm": 2.2647154331207275, + "learning_rate": 1e-06, + "loss": 0.8125, + "mean_token_accuracy": 0.7421761751174927, + "num_tokens": 252715002.0, + "step": 9979 + }, + { + "epoch": 1.09598067208434, + "grad_norm": 2.122601270675659, + "learning_rate": 
1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.7023401856422424, + "num_tokens": 252744026.0, + "step": 9980 + }, + { + "epoch": 1.0960904897869537, + "grad_norm": 2.0988306999206543, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.705636739730835, + "num_tokens": 252772654.0, + "step": 9981 + }, + { + "epoch": 1.0962003074895672, + "grad_norm": 2.2953555583953857, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7312892079353333, + "num_tokens": 252796442.0, + "step": 9982 + }, + { + "epoch": 1.096310125192181, + "grad_norm": 2.3164772987365723, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7275611758232117, + "num_tokens": 252819471.0, + "step": 9983 + }, + { + "epoch": 1.0964199428947947, + "grad_norm": 2.010242223739624, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7118578553199768, + "num_tokens": 252851498.0, + "step": 9984 + }, + { + "epoch": 1.0965297605974083, + "grad_norm": 2.0977652072906494, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7058883905410767, + "num_tokens": 252883038.0, + "step": 9985 + }, + { + "epoch": 1.096639578300022, + "grad_norm": 2.1040287017822266, + "learning_rate": 1e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7234109044075012, + "num_tokens": 252912351.0, + "step": 9986 + }, + { + "epoch": 1.0967493960026355, + "grad_norm": 2.4310696125030518, + "learning_rate": 1e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.7088415622711182, + "num_tokens": 252936329.0, + "step": 9987 + }, + { + "epoch": 1.0968592137052493, + "grad_norm": 1.91621732711792, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7201015949249268, + "num_tokens": 252967049.0, + "step": 9988 + }, + { + "epoch": 1.096969031407863, + "grad_norm": 2.436145782470703, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7189659476280212, + "num_tokens": 252990088.0, + "step": 9989 + }, + { + "epoch": 
1.0970788491104766, + "grad_norm": 2.448798179626465, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7045989036560059, + "num_tokens": 253014211.0, + "step": 9990 + }, + { + "epoch": 1.0971886668130904, + "grad_norm": 2.398097515106201, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7072539925575256, + "num_tokens": 253039724.0, + "step": 9991 + }, + { + "epoch": 1.0972984845157039, + "grad_norm": 2.202446222305298, + "learning_rate": 1e-06, + "loss": 0.7665, + "mean_token_accuracy": 0.7604851126670837, + "num_tokens": 253067762.0, + "step": 9992 + }, + { + "epoch": 1.0974083022183176, + "grad_norm": 2.7075092792510986, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7205118536949158, + "num_tokens": 253088011.0, + "step": 9993 + }, + { + "epoch": 1.0975181199209312, + "grad_norm": 2.077238082885742, + "learning_rate": 1e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7200727462768555, + "num_tokens": 253115209.0, + "step": 9994 + }, + { + "epoch": 1.097627937623545, + "grad_norm": 1.9598846435546875, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7292287349700928, + "num_tokens": 253146436.0, + "step": 9995 + }, + { + "epoch": 1.0977377553261585, + "grad_norm": 2.553212881088257, + "learning_rate": 1e-06, + "loss": 0.8303, + "mean_token_accuracy": 0.7366660237312317, + "num_tokens": 253165662.0, + "step": 9996 + }, + { + "epoch": 1.0978475730287722, + "grad_norm": 2.7082865238189697, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7222184538841248, + "num_tokens": 253186734.0, + "step": 9997 + }, + { + "epoch": 1.097957390731386, + "grad_norm": 2.207965135574341, + "learning_rate": 1e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.7134159207344055, + "num_tokens": 253212409.0, + "step": 9998 + }, + { + "epoch": 1.0980672084339995, + "grad_norm": 2.0938355922698975, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 
0.7174111604690552, + "num_tokens": 253242040.0, + "step": 9999 + }, + { + "epoch": 1.0981770261366133, + "grad_norm": 2.110985517501831, + "learning_rate": 1e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7309757471084595, + "num_tokens": 253269200.0, + "step": 10000 + }, + { + "epoch": 1.0982868438392268, + "grad_norm": 2.4318699836730957, + "learning_rate": 1e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.722759485244751, + "num_tokens": 253291383.0, + "step": 10001 + }, + { + "epoch": 1.0983966615418406, + "grad_norm": 2.31109881401062, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7059028148651123, + "num_tokens": 253318068.0, + "step": 10002 + }, + { + "epoch": 1.098506479244454, + "grad_norm": 1.7435052394866943, + "learning_rate": 1e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.7229461669921875, + "num_tokens": 253357361.0, + "step": 10003 + }, + { + "epoch": 1.0986162969470679, + "grad_norm": 2.5361216068267822, + "learning_rate": 1e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7225140333175659, + "num_tokens": 253378044.0, + "step": 10004 + }, + { + "epoch": 1.0987261146496816, + "grad_norm": 2.1605658531188965, + "learning_rate": 1e-06, + "loss": 0.8555, + "mean_token_accuracy": 0.7380480766296387, + "num_tokens": 253403507.0, + "step": 10005 + }, + { + "epoch": 1.0988359323522952, + "grad_norm": 2.5502476692199707, + "learning_rate": 1e-06, + "loss": 0.8213, + "mean_token_accuracy": 0.7404618263244629, + "num_tokens": 253422412.0, + "step": 10006 + }, + { + "epoch": 1.098945750054909, + "grad_norm": 2.1772634983062744, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7174438238143921, + "num_tokens": 253448293.0, + "step": 10007 + }, + { + "epoch": 1.0990555677575224, + "grad_norm": 2.3405473232269287, + "learning_rate": 1e-06, + "loss": 0.8351, + "mean_token_accuracy": 0.73581862449646, + "num_tokens": 253471224.0, + "step": 10008 + }, + { + "epoch": 1.0991653854601362, + "grad_norm": 
2.3822665214538574, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7125421762466431, + "num_tokens": 253493817.0, + "step": 10009 + }, + { + "epoch": 1.0992752031627497, + "grad_norm": 2.400878429412842, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.708314061164856, + "num_tokens": 253519336.0, + "step": 10010 + }, + { + "epoch": 1.0993850208653635, + "grad_norm": 2.17268443107605, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.6879543662071228, + "num_tokens": 253551433.0, + "step": 10011 + }, + { + "epoch": 1.0994948385679773, + "grad_norm": 2.4196488857269287, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7166019082069397, + "num_tokens": 253574486.0, + "step": 10012 + }, + { + "epoch": 1.0996046562705908, + "grad_norm": 1.9790834188461304, + "learning_rate": 1e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7167668342590332, + "num_tokens": 253603833.0, + "step": 10013 + }, + { + "epoch": 1.0997144739732045, + "grad_norm": 2.2314095497131348, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.714341938495636, + "num_tokens": 253629503.0, + "step": 10014 + }, + { + "epoch": 1.099824291675818, + "grad_norm": 2.6861460208892822, + "learning_rate": 1e-06, + "loss": 0.824, + "mean_token_accuracy": 0.7410274744033813, + "num_tokens": 253647854.0, + "step": 10015 + }, + { + "epoch": 1.0999341093784318, + "grad_norm": 2.2517127990722656, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7024637460708618, + "num_tokens": 253675284.0, + "step": 10016 + }, + { + "epoch": 1.1000439270810454, + "grad_norm": 2.3627066612243652, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7029350996017456, + "num_tokens": 253700886.0, + "step": 10017 + }, + { + "epoch": 1.1001537447836591, + "grad_norm": 2.218653917312622, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7211905717849731, + "num_tokens": 
253727295.0, + "step": 10018 + }, + { + "epoch": 1.1002635624862729, + "grad_norm": 2.537853717803955, + "learning_rate": 1e-06, + "loss": 0.8573, + "mean_token_accuracy": 0.7306474447250366, + "num_tokens": 253747907.0, + "step": 10019 + }, + { + "epoch": 1.1003733801888864, + "grad_norm": 2.278613567352295, + "learning_rate": 1e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7207896113395691, + "num_tokens": 253773876.0, + "step": 10020 + }, + { + "epoch": 1.1004831978915002, + "grad_norm": 2.482823371887207, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7111545205116272, + "num_tokens": 253798645.0, + "step": 10021 + }, + { + "epoch": 1.1005930155941137, + "grad_norm": 2.0218305587768555, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7241706848144531, + "num_tokens": 253827731.0, + "step": 10022 + }, + { + "epoch": 1.1007028332967275, + "grad_norm": 2.1685075759887695, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.702418327331543, + "num_tokens": 253853877.0, + "step": 10023 + }, + { + "epoch": 1.100812650999341, + "grad_norm": 2.322080135345459, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7226065993309021, + "num_tokens": 253876316.0, + "step": 10024 + }, + { + "epoch": 1.1009224687019548, + "grad_norm": 2.728715181350708, + "learning_rate": 1e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7331889867782593, + "num_tokens": 253893820.0, + "step": 10025 + }, + { + "epoch": 1.1010322864045685, + "grad_norm": 2.5552947521209717, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7154387831687927, + "num_tokens": 253914828.0, + "step": 10026 + }, + { + "epoch": 1.101142104107182, + "grad_norm": 2.3562171459198, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7237609028816223, + "num_tokens": 253938608.0, + "step": 10027 + }, + { + "epoch": 1.1012519218097958, + "grad_norm": 2.6501119136810303, + "learning_rate": 1e-06, + 
"loss": 0.8708, + "mean_token_accuracy": 0.724594235420227, + "num_tokens": 253958597.0, + "step": 10028 + }, + { + "epoch": 1.1013617395124093, + "grad_norm": 2.2234201431274414, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7232571840286255, + "num_tokens": 253984212.0, + "step": 10029 + }, + { + "epoch": 1.101471557215023, + "grad_norm": 1.920287013053894, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.7022098898887634, + "num_tokens": 254016845.0, + "step": 10030 + }, + { + "epoch": 1.1015813749176366, + "grad_norm": 1.8647825717926025, + "learning_rate": 1e-06, + "loss": 1.1145, + "mean_token_accuracy": 0.663337230682373, + "num_tokens": 254052225.0, + "step": 10031 + }, + { + "epoch": 1.1016911926202504, + "grad_norm": 2.165041923522949, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7178544998168945, + "num_tokens": 254078381.0, + "step": 10032 + }, + { + "epoch": 1.1018010103228641, + "grad_norm": 2.4417145252227783, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7174909114837646, + "num_tokens": 254100357.0, + "step": 10033 + }, + { + "epoch": 1.1019108280254777, + "grad_norm": 2.2619576454162598, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7078821659088135, + "num_tokens": 254126519.0, + "step": 10034 + }, + { + "epoch": 1.1020206457280914, + "grad_norm": 1.9795950651168823, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.707598090171814, + "num_tokens": 254158144.0, + "step": 10035 + }, + { + "epoch": 1.102130463430705, + "grad_norm": 2.763840436935425, + "learning_rate": 1e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.720489501953125, + "num_tokens": 254176884.0, + "step": 10036 + }, + { + "epoch": 1.1022402811333187, + "grad_norm": 2.6021788120269775, + "learning_rate": 1e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7329205274581909, + "num_tokens": 254196902.0, + "step": 10037 + }, + { + "epoch": 
1.1023500988359323, + "grad_norm": 2.484055757522583, + "learning_rate": 1e-06, + "loss": 0.8403, + "mean_token_accuracy": 0.7418544292449951, + "num_tokens": 254218848.0, + "step": 10038 + }, + { + "epoch": 1.102459916538546, + "grad_norm": 2.0318431854248047, + "learning_rate": 1e-06, + "loss": 0.8472, + "mean_token_accuracy": 0.7315465211868286, + "num_tokens": 254246153.0, + "step": 10039 + }, + { + "epoch": 1.1025697342411598, + "grad_norm": 2.2338216304779053, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7169123888015747, + "num_tokens": 254272198.0, + "step": 10040 + }, + { + "epoch": 1.1026795519437733, + "grad_norm": 2.1654343605041504, + "learning_rate": 1e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7170671820640564, + "num_tokens": 254299757.0, + "step": 10041 + }, + { + "epoch": 1.102789369646387, + "grad_norm": 2.17844295501709, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7244620323181152, + "num_tokens": 254324041.0, + "step": 10042 + }, + { + "epoch": 1.1028991873490006, + "grad_norm": 2.6020238399505615, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7194077968597412, + "num_tokens": 254343965.0, + "step": 10043 + }, + { + "epoch": 1.1030090050516144, + "grad_norm": 2.2634499073028564, + "learning_rate": 1e-06, + "loss": 0.7326, + "mean_token_accuracy": 0.7654942274093628, + "num_tokens": 254365918.0, + "step": 10044 + }, + { + "epoch": 1.103118822754228, + "grad_norm": 2.3591840267181396, + "learning_rate": 1e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7299686074256897, + "num_tokens": 254388683.0, + "step": 10045 + }, + { + "epoch": 1.1032286404568417, + "grad_norm": 2.390143871307373, + "learning_rate": 1e-06, + "loss": 0.8383, + "mean_token_accuracy": 0.7319586277008057, + "num_tokens": 254412879.0, + "step": 10046 + }, + { + "epoch": 1.1033384581594552, + "grad_norm": 2.3629322052001953, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 
0.7181440591812134, + "num_tokens": 254437716.0, + "step": 10047 + }, + { + "epoch": 1.103448275862069, + "grad_norm": 2.327378749847412, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7033513784408569, + "num_tokens": 254464855.0, + "step": 10048 + }, + { + "epoch": 1.1035580935646827, + "grad_norm": 2.1939029693603516, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.707395076751709, + "num_tokens": 254492280.0, + "step": 10049 + }, + { + "epoch": 1.1036679112672962, + "grad_norm": 2.5253310203552246, + "learning_rate": 1e-06, + "loss": 0.8029, + "mean_token_accuracy": 0.7514082789421082, + "num_tokens": 254512912.0, + "step": 10050 + }, + { + "epoch": 1.10377772896991, + "grad_norm": 2.3009705543518066, + "learning_rate": 1e-06, + "loss": 0.8354, + "mean_token_accuracy": 0.731877326965332, + "num_tokens": 254536248.0, + "step": 10051 + }, + { + "epoch": 1.1038875466725235, + "grad_norm": 2.3583970069885254, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7150895595550537, + "num_tokens": 254559136.0, + "step": 10052 + }, + { + "epoch": 1.1039973643751373, + "grad_norm": 2.49373722076416, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.707805335521698, + "num_tokens": 254580443.0, + "step": 10053 + }, + { + "epoch": 1.104107182077751, + "grad_norm": 2.0013766288757324, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7233121991157532, + "num_tokens": 254610686.0, + "step": 10054 + }, + { + "epoch": 1.1042169997803646, + "grad_norm": 2.206850528717041, + "learning_rate": 1e-06, + "loss": 0.8653, + "mean_token_accuracy": 0.7396683096885681, + "num_tokens": 254636739.0, + "step": 10055 + }, + { + "epoch": 1.1043268174829783, + "grad_norm": 2.1559817790985107, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7180176973342896, + "num_tokens": 254663355.0, + "step": 10056 + }, + { + "epoch": 1.1044366351855919, + "grad_norm": 
2.169550657272339, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.7008229494094849, + "num_tokens": 254691429.0, + "step": 10057 + }, + { + "epoch": 1.1045464528882056, + "grad_norm": 2.252051830291748, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7107881307601929, + "num_tokens": 254717795.0, + "step": 10058 + }, + { + "epoch": 1.1046562705908192, + "grad_norm": 2.0023176670074463, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7087460160255432, + "num_tokens": 254748885.0, + "step": 10059 + }, + { + "epoch": 1.104766088293433, + "grad_norm": 2.345608949661255, + "learning_rate": 1e-06, + "loss": 0.856, + "mean_token_accuracy": 0.7278077602386475, + "num_tokens": 254770435.0, + "step": 10060 + }, + { + "epoch": 1.1048759059960465, + "grad_norm": 2.445984363555908, + "learning_rate": 1e-06, + "loss": 0.8576, + "mean_token_accuracy": 0.7266941666603088, + "num_tokens": 254792209.0, + "step": 10061 + }, + { + "epoch": 1.1049857236986602, + "grad_norm": 2.2553534507751465, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.721146821975708, + "num_tokens": 254815647.0, + "step": 10062 + }, + { + "epoch": 1.105095541401274, + "grad_norm": 2.0315253734588623, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7061710357666016, + "num_tokens": 254846274.0, + "step": 10063 + }, + { + "epoch": 1.1052053591038875, + "grad_norm": 2.286311149597168, + "learning_rate": 1e-06, + "loss": 0.8643, + "mean_token_accuracy": 0.7260400056838989, + "num_tokens": 254871969.0, + "step": 10064 + }, + { + "epoch": 1.1053151768065013, + "grad_norm": 2.422941207885742, + "learning_rate": 1e-06, + "loss": 0.7961, + "mean_token_accuracy": 0.7464674711227417, + "num_tokens": 254892885.0, + "step": 10065 + }, + { + "epoch": 1.1054249945091148, + "grad_norm": 2.4102981090545654, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.715106725692749, + "num_tokens": 
254915029.0, + "step": 10066 + }, + { + "epoch": 1.1055348122117286, + "grad_norm": 2.357509136199951, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7331416606903076, + "num_tokens": 254935518.0, + "step": 10067 + }, + { + "epoch": 1.105644629914342, + "grad_norm": 1.996910572052002, + "learning_rate": 1e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.725613534450531, + "num_tokens": 254967939.0, + "step": 10068 + }, + { + "epoch": 1.1057544476169558, + "grad_norm": 2.1315195560455322, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.70601487159729, + "num_tokens": 254995475.0, + "step": 10069 + }, + { + "epoch": 1.1058642653195696, + "grad_norm": 2.1499009132385254, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7258697748184204, + "num_tokens": 255023166.0, + "step": 10070 + }, + { + "epoch": 1.1059740830221831, + "grad_norm": 2.090606927871704, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.716650128364563, + "num_tokens": 255051462.0, + "step": 10071 + }, + { + "epoch": 1.106083900724797, + "grad_norm": 2.168078660964966, + "learning_rate": 1e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.716986358165741, + "num_tokens": 255077329.0, + "step": 10072 + }, + { + "epoch": 1.1061937184274104, + "grad_norm": 2.0094246864318848, + "learning_rate": 1e-06, + "loss": 0.8581, + "mean_token_accuracy": 0.7295010685920715, + "num_tokens": 255108670.0, + "step": 10073 + }, + { + "epoch": 1.1063035361300242, + "grad_norm": 2.0668272972106934, + "learning_rate": 1e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7252178192138672, + "num_tokens": 255138981.0, + "step": 10074 + }, + { + "epoch": 1.1064133538326377, + "grad_norm": 2.1112992763519287, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7108795642852783, + "num_tokens": 255167843.0, + "step": 10075 + }, + { + "epoch": 1.1065231715352515, + "grad_norm": 2.4810426235198975, + "learning_rate": 1e-06, 
+ "loss": 0.9029, + "mean_token_accuracy": 0.7224474549293518, + "num_tokens": 255188440.0, + "step": 10076 + }, + { + "epoch": 1.1066329892378652, + "grad_norm": 2.2073917388916016, + "learning_rate": 1e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7190390825271606, + "num_tokens": 255215654.0, + "step": 10077 + }, + { + "epoch": 1.1067428069404788, + "grad_norm": 2.404008150100708, + "learning_rate": 1e-06, + "loss": 0.8652, + "mean_token_accuracy": 0.7344833016395569, + "num_tokens": 255237458.0, + "step": 10078 + }, + { + "epoch": 1.1068526246430925, + "grad_norm": 2.3950114250183105, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7249029278755188, + "num_tokens": 255262404.0, + "step": 10079 + }, + { + "epoch": 1.106962442345706, + "grad_norm": 2.6265063285827637, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7104943990707397, + "num_tokens": 255282263.0, + "step": 10080 + }, + { + "epoch": 1.1070722600483198, + "grad_norm": 2.4901790618896484, + "learning_rate": 1e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7306473255157471, + "num_tokens": 255303462.0, + "step": 10081 + }, + { + "epoch": 1.1071820777509334, + "grad_norm": 2.557141065597534, + "learning_rate": 1e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.7375686168670654, + "num_tokens": 255323965.0, + "step": 10082 + }, + { + "epoch": 1.1072918954535471, + "grad_norm": 2.2718536853790283, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7227437496185303, + "num_tokens": 255347813.0, + "step": 10083 + }, + { + "epoch": 1.1074017131561609, + "grad_norm": 2.2629168033599854, + "learning_rate": 1e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.7385832071304321, + "num_tokens": 255371902.0, + "step": 10084 + }, + { + "epoch": 1.1075115308587744, + "grad_norm": 2.3069634437561035, + "learning_rate": 1e-06, + "loss": 0.8259, + "mean_token_accuracy": 0.7386283874511719, + "num_tokens": 255395019.0, + "step": 10085 + }, + { + 
"epoch": 1.1076213485613882, + "grad_norm": 2.149954319000244, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7025971412658691, + "num_tokens": 255422097.0, + "step": 10086 + }, + { + "epoch": 1.1077311662640017, + "grad_norm": 2.3100836277008057, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7202941179275513, + "num_tokens": 255446815.0, + "step": 10087 + }, + { + "epoch": 1.1078409839666155, + "grad_norm": 2.131736993789673, + "learning_rate": 1e-06, + "loss": 0.8235, + "mean_token_accuracy": 0.739633321762085, + "num_tokens": 255472364.0, + "step": 10088 + }, + { + "epoch": 1.107950801669229, + "grad_norm": 2.175971508026123, + "learning_rate": 1e-06, + "loss": 0.888, + "mean_token_accuracy": 0.7189833521842957, + "num_tokens": 255496949.0, + "step": 10089 + }, + { + "epoch": 1.1080606193718427, + "grad_norm": 2.554962635040283, + "learning_rate": 1e-06, + "loss": 0.8699, + "mean_token_accuracy": 0.7235596179962158, + "num_tokens": 255518239.0, + "step": 10090 + }, + { + "epoch": 1.1081704370744565, + "grad_norm": 2.6148195266723633, + "learning_rate": 1e-06, + "loss": 0.7726, + "mean_token_accuracy": 0.7523187398910522, + "num_tokens": 255536024.0, + "step": 10091 + }, + { + "epoch": 1.10828025477707, + "grad_norm": 2.321704149246216, + "learning_rate": 1e-06, + "loss": 0.7989, + "mean_token_accuracy": 0.7432923913002014, + "num_tokens": 255558617.0, + "step": 10092 + }, + { + "epoch": 1.1083900724796838, + "grad_norm": 2.0403943061828613, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7193290591239929, + "num_tokens": 255585478.0, + "step": 10093 + }, + { + "epoch": 1.1084998901822973, + "grad_norm": 2.2036654949188232, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7057145833969116, + "num_tokens": 255612295.0, + "step": 10094 + }, + { + "epoch": 1.108609707884911, + "grad_norm": 2.479389190673828, + "learning_rate": 1e-06, + "loss": 0.9596, + 
"mean_token_accuracy": 0.7088048458099365, + "num_tokens": 255636468.0, + "step": 10095 + }, + { + "epoch": 1.1087195255875246, + "grad_norm": 2.2261905670166016, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7184842824935913, + "num_tokens": 255662606.0, + "step": 10096 + }, + { + "epoch": 1.1088293432901384, + "grad_norm": 2.3467605113983154, + "learning_rate": 1e-06, + "loss": 0.8555, + "mean_token_accuracy": 0.7347480058670044, + "num_tokens": 255684841.0, + "step": 10097 + }, + { + "epoch": 1.108939160992752, + "grad_norm": 2.2002944946289062, + "learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.6977756023406982, + "num_tokens": 255713302.0, + "step": 10098 + }, + { + "epoch": 1.1090489786953657, + "grad_norm": 2.224614381790161, + "learning_rate": 1e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7257291078567505, + "num_tokens": 255739654.0, + "step": 10099 + }, + { + "epoch": 1.1091587963979794, + "grad_norm": 2.245622158050537, + "learning_rate": 1e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.7243936657905579, + "num_tokens": 255764469.0, + "step": 10100 + }, + { + "epoch": 1.109268614100593, + "grad_norm": 2.224544048309326, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7076584100723267, + "num_tokens": 255789176.0, + "step": 10101 + }, + { + "epoch": 1.1093784318032067, + "grad_norm": 2.0257153511047363, + "learning_rate": 1e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.6905543804168701, + "num_tokens": 255820148.0, + "step": 10102 + }, + { + "epoch": 1.1094882495058203, + "grad_norm": 2.203727960586548, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7086312770843506, + "num_tokens": 255847292.0, + "step": 10103 + }, + { + "epoch": 1.109598067208434, + "grad_norm": 2.342209815979004, + "learning_rate": 1e-06, + "loss": 0.8131, + "mean_token_accuracy": 0.7469249963760376, + "num_tokens": 255870255.0, + "step": 10104 + }, + { + "epoch": 
1.1097078849110478, + "grad_norm": 2.631452798843384, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7064366340637207, + "num_tokens": 255891556.0, + "step": 10105 + }, + { + "epoch": 1.1098177026136613, + "grad_norm": 2.226799249649048, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.6961201429367065, + "num_tokens": 255918090.0, + "step": 10106 + }, + { + "epoch": 1.109927520316275, + "grad_norm": 2.2349321842193604, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7272589206695557, + "num_tokens": 255942120.0, + "step": 10107 + }, + { + "epoch": 1.1100373380188886, + "grad_norm": 2.2997472286224365, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7200571298599243, + "num_tokens": 255966496.0, + "step": 10108 + }, + { + "epoch": 1.1101471557215024, + "grad_norm": 2.449800968170166, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.7062389850616455, + "num_tokens": 255994972.0, + "step": 10109 + }, + { + "epoch": 1.110256973424116, + "grad_norm": 2.2019665241241455, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7101671099662781, + "num_tokens": 256024008.0, + "step": 10110 + }, + { + "epoch": 1.1103667911267296, + "grad_norm": 2.6981515884399414, + "learning_rate": 1e-06, + "loss": 0.7985, + "mean_token_accuracy": 0.7405057549476624, + "num_tokens": 256041765.0, + "step": 10111 + }, + { + "epoch": 1.1104766088293432, + "grad_norm": 2.22627329826355, + "learning_rate": 1e-06, + "loss": 0.8649, + "mean_token_accuracy": 0.7295467853546143, + "num_tokens": 256068402.0, + "step": 10112 + }, + { + "epoch": 1.110586426531957, + "grad_norm": 2.445054769515991, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7251869440078735, + "num_tokens": 256090951.0, + "step": 10113 + }, + { + "epoch": 1.1106962442345707, + "grad_norm": 2.2652339935302734, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 
0.7153240442276001, + "num_tokens": 256117471.0, + "step": 10114 + }, + { + "epoch": 1.1108060619371842, + "grad_norm": 2.4032628536224365, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7270200252532959, + "num_tokens": 256140770.0, + "step": 10115 + }, + { + "epoch": 1.110915879639798, + "grad_norm": 2.4517295360565186, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7208793759346008, + "num_tokens": 256162674.0, + "step": 10116 + }, + { + "epoch": 1.1110256973424115, + "grad_norm": 2.0372705459594727, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7260102033615112, + "num_tokens": 256193095.0, + "step": 10117 + }, + { + "epoch": 1.1111355150450253, + "grad_norm": 2.2753517627716064, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7360879778862, + "num_tokens": 256218292.0, + "step": 10118 + }, + { + "epoch": 1.111245332747639, + "grad_norm": 2.170267105102539, + "learning_rate": 1e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.7299820184707642, + "num_tokens": 256244114.0, + "step": 10119 + }, + { + "epoch": 1.1113551504502526, + "grad_norm": 2.1456077098846436, + "learning_rate": 1e-06, + "loss": 0.8227, + "mean_token_accuracy": 0.732854425907135, + "num_tokens": 256272465.0, + "step": 10120 + }, + { + "epoch": 1.1114649681528663, + "grad_norm": 2.052234172821045, + "learning_rate": 1e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7267086505889893, + "num_tokens": 256300735.0, + "step": 10121 + }, + { + "epoch": 1.1115747858554799, + "grad_norm": 2.50632381439209, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.699589729309082, + "num_tokens": 256326045.0, + "step": 10122 + }, + { + "epoch": 1.1116846035580936, + "grad_norm": 2.485654354095459, + "learning_rate": 1e-06, + "loss": 0.8614, + "mean_token_accuracy": 0.7424036264419556, + "num_tokens": 256345842.0, + "step": 10123 + }, + { + "epoch": 1.1117944212607072, + "grad_norm": 
2.1745193004608154, + "learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.697052001953125, + "num_tokens": 256373771.0, + "step": 10124 + }, + { + "epoch": 1.111904238963321, + "grad_norm": 1.9075895547866821, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7183618545532227, + "num_tokens": 256407051.0, + "step": 10125 + }, + { + "epoch": 1.1120140566659344, + "grad_norm": 2.027397871017456, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7224120497703552, + "num_tokens": 256435526.0, + "step": 10126 + }, + { + "epoch": 1.1121238743685482, + "grad_norm": 2.2177510261535645, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7307523488998413, + "num_tokens": 256460222.0, + "step": 10127 + }, + { + "epoch": 1.112233692071162, + "grad_norm": 2.3815715312957764, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.726022481918335, + "num_tokens": 256484495.0, + "step": 10128 + }, + { + "epoch": 1.1123435097737755, + "grad_norm": 2.291273832321167, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.6942936182022095, + "num_tokens": 256513472.0, + "step": 10129 + }, + { + "epoch": 1.1124533274763893, + "grad_norm": 2.203857660293579, + "learning_rate": 1e-06, + "loss": 0.8485, + "mean_token_accuracy": 0.7329427003860474, + "num_tokens": 256538019.0, + "step": 10130 + }, + { + "epoch": 1.1125631451790028, + "grad_norm": 2.3396899700164795, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7157785892486572, + "num_tokens": 256561711.0, + "step": 10131 + }, + { + "epoch": 1.1126729628816165, + "grad_norm": 2.4985110759735107, + "learning_rate": 1e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7260627150535583, + "num_tokens": 256582349.0, + "step": 10132 + }, + { + "epoch": 1.11278278058423, + "grad_norm": 2.4556264877319336, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7153682708740234, + "num_tokens": 
256603547.0, + "step": 10133 + }, + { + "epoch": 1.1128925982868438, + "grad_norm": 2.4249863624572754, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7117374539375305, + "num_tokens": 256626426.0, + "step": 10134 + }, + { + "epoch": 1.1130024159894576, + "grad_norm": 2.3566339015960693, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7339211702346802, + "num_tokens": 256649904.0, + "step": 10135 + }, + { + "epoch": 1.1131122336920711, + "grad_norm": 2.4516761302948, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7172102332115173, + "num_tokens": 256671708.0, + "step": 10136 + }, + { + "epoch": 1.1132220513946849, + "grad_norm": 2.4251630306243896, + "learning_rate": 1e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7278542518615723, + "num_tokens": 256694335.0, + "step": 10137 + }, + { + "epoch": 1.1133318690972984, + "grad_norm": 2.2449522018432617, + "learning_rate": 1e-06, + "loss": 1.0423, + "mean_token_accuracy": 0.6961237192153931, + "num_tokens": 256719143.0, + "step": 10138 + }, + { + "epoch": 1.1134416867999122, + "grad_norm": 2.530611515045166, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7242181897163391, + "num_tokens": 256742073.0, + "step": 10139 + }, + { + "epoch": 1.1135515045025257, + "grad_norm": 2.1606879234313965, + "learning_rate": 1e-06, + "loss": 0.8387, + "mean_token_accuracy": 0.7373029589653015, + "num_tokens": 256768208.0, + "step": 10140 + }, + { + "epoch": 1.1136613222051395, + "grad_norm": 2.6809449195861816, + "learning_rate": 1e-06, + "loss": 0.873, + "mean_token_accuracy": 0.725852370262146, + "num_tokens": 256789476.0, + "step": 10141 + }, + { + "epoch": 1.1137711399077532, + "grad_norm": 2.4340405464172363, + "learning_rate": 1e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.7408173680305481, + "num_tokens": 256811486.0, + "step": 10142 + }, + { + "epoch": 1.1138809576103668, + "grad_norm": 2.563861131668091, + "learning_rate": 
1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7189435958862305, + "num_tokens": 256833515.0, + "step": 10143 + }, + { + "epoch": 1.1139907753129805, + "grad_norm": 2.393596887588501, + "learning_rate": 1e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7309492826461792, + "num_tokens": 256856431.0, + "step": 10144 + }, + { + "epoch": 1.114100593015594, + "grad_norm": 2.746109962463379, + "learning_rate": 1e-06, + "loss": 0.7689, + "mean_token_accuracy": 0.7488507628440857, + "num_tokens": 256873516.0, + "step": 10145 + }, + { + "epoch": 1.1142104107182078, + "grad_norm": 2.1798269748687744, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7098168134689331, + "num_tokens": 256902169.0, + "step": 10146 + }, + { + "epoch": 1.1143202284208213, + "grad_norm": 2.183007001876831, + "learning_rate": 1e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7156399488449097, + "num_tokens": 256926197.0, + "step": 10147 + }, + { + "epoch": 1.114430046123435, + "grad_norm": 2.158374786376953, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7187491655349731, + "num_tokens": 256954579.0, + "step": 10148 + }, + { + "epoch": 1.1145398638260489, + "grad_norm": 2.2526490688323975, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7186583876609802, + "num_tokens": 256979740.0, + "step": 10149 + }, + { + "epoch": 1.1146496815286624, + "grad_norm": 2.262655735015869, + "learning_rate": 1e-06, + "loss": 0.8399, + "mean_token_accuracy": 0.7370972633361816, + "num_tokens": 257002914.0, + "step": 10150 + }, + { + "epoch": 1.1147594992312762, + "grad_norm": 2.2737350463867188, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.6920789480209351, + "num_tokens": 257029449.0, + "step": 10151 + }, + { + "epoch": 1.1148693169338897, + "grad_norm": 2.25120210647583, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7204873561859131, + "num_tokens": 257057160.0, + "step": 10152 + }, + { + 
"epoch": 1.1149791346365034, + "grad_norm": 2.473421573638916, + "learning_rate": 1e-06, + "loss": 0.884, + "mean_token_accuracy": 0.7217809557914734, + "num_tokens": 257080166.0, + "step": 10153 + }, + { + "epoch": 1.115088952339117, + "grad_norm": 2.1735732555389404, + "learning_rate": 1e-06, + "loss": 1.0527, + "mean_token_accuracy": 0.6892362833023071, + "num_tokens": 257109104.0, + "step": 10154 + }, + { + "epoch": 1.1151987700417307, + "grad_norm": 2.360583782196045, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7141537666320801, + "num_tokens": 257133258.0, + "step": 10155 + }, + { + "epoch": 1.1153085877443445, + "grad_norm": 2.243712902069092, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7051974534988403, + "num_tokens": 257160572.0, + "step": 10156 + }, + { + "epoch": 1.115418405446958, + "grad_norm": 2.5975637435913086, + "learning_rate": 1e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7281684875488281, + "num_tokens": 257182996.0, + "step": 10157 + }, + { + "epoch": 1.1155282231495718, + "grad_norm": 2.372825860977173, + "learning_rate": 1e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7334357500076294, + "num_tokens": 257205323.0, + "step": 10158 + }, + { + "epoch": 1.1156380408521853, + "grad_norm": 2.3640756607055664, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7197108268737793, + "num_tokens": 257228420.0, + "step": 10159 + }, + { + "epoch": 1.115747858554799, + "grad_norm": 2.044609308242798, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7112827301025391, + "num_tokens": 257256498.0, + "step": 10160 + }, + { + "epoch": 1.1158576762574126, + "grad_norm": 2.5763821601867676, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7160553932189941, + "num_tokens": 257277539.0, + "step": 10161 + }, + { + "epoch": 1.1159674939600264, + "grad_norm": 2.548692464828491, + "learning_rate": 1e-06, + "loss": 0.9466, + 
"mean_token_accuracy": 0.7233798503875732, + "num_tokens": 257299354.0, + "step": 10162 + }, + { + "epoch": 1.11607731166264, + "grad_norm": 2.2326743602752686, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.702436089515686, + "num_tokens": 257326731.0, + "step": 10163 + }, + { + "epoch": 1.1161871293652537, + "grad_norm": 2.0254952907562256, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7117239832878113, + "num_tokens": 257357424.0, + "step": 10164 + }, + { + "epoch": 1.1162969470678674, + "grad_norm": 2.231389045715332, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7178182601928711, + "num_tokens": 257383023.0, + "step": 10165 + }, + { + "epoch": 1.116406764770481, + "grad_norm": 2.1984477043151855, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.711923360824585, + "num_tokens": 257410508.0, + "step": 10166 + }, + { + "epoch": 1.1165165824730947, + "grad_norm": 2.3597471714019775, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7051483392715454, + "num_tokens": 257432252.0, + "step": 10167 + }, + { + "epoch": 1.1166264001757082, + "grad_norm": 2.3958239555358887, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7184785604476929, + "num_tokens": 257456161.0, + "step": 10168 + }, + { + "epoch": 1.116736217878322, + "grad_norm": 2.3027524948120117, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.6852318048477173, + "num_tokens": 257483691.0, + "step": 10169 + }, + { + "epoch": 1.1168460355809358, + "grad_norm": 2.264989137649536, + "learning_rate": 1e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.7298768162727356, + "num_tokens": 257510983.0, + "step": 10170 + }, + { + "epoch": 1.1169558532835493, + "grad_norm": 2.208202600479126, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7070299386978149, + "num_tokens": 257538439.0, + "step": 10171 + }, + { + "epoch": 1.117065670986163, + 
"grad_norm": 2.615405321121216, + "learning_rate": 1e-06, + "loss": 0.819, + "mean_token_accuracy": 0.7438269257545471, + "num_tokens": 257558581.0, + "step": 10172 + }, + { + "epoch": 1.1171754886887766, + "grad_norm": 2.5789225101470947, + "learning_rate": 1e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.7179805040359497, + "num_tokens": 257581053.0, + "step": 10173 + }, + { + "epoch": 1.1172853063913903, + "grad_norm": 2.3507769107818604, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7020602822303772, + "num_tokens": 257606766.0, + "step": 10174 + }, + { + "epoch": 1.1173951240940039, + "grad_norm": 2.295600652694702, + "learning_rate": 1e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.7135694026947021, + "num_tokens": 257633552.0, + "step": 10175 + }, + { + "epoch": 1.1175049417966176, + "grad_norm": 2.117384195327759, + "learning_rate": 1e-06, + "loss": 0.8507, + "mean_token_accuracy": 0.7316669225692749, + "num_tokens": 257660595.0, + "step": 10176 + }, + { + "epoch": 1.1176147594992312, + "grad_norm": 2.5306668281555176, + "learning_rate": 1e-06, + "loss": 0.8189, + "mean_token_accuracy": 0.7447283267974854, + "num_tokens": 257682160.0, + "step": 10177 + }, + { + "epoch": 1.117724577201845, + "grad_norm": 2.475478172302246, + "learning_rate": 1e-06, + "loss": 0.8485, + "mean_token_accuracy": 0.732589066028595, + "num_tokens": 257704556.0, + "step": 10178 + }, + { + "epoch": 1.1178343949044587, + "grad_norm": 2.373511552810669, + "learning_rate": 1e-06, + "loss": 0.8429, + "mean_token_accuracy": 0.7392216920852661, + "num_tokens": 257728933.0, + "step": 10179 + }, + { + "epoch": 1.1179442126070722, + "grad_norm": 2.18587064743042, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.702713668346405, + "num_tokens": 257757827.0, + "step": 10180 + }, + { + "epoch": 1.118054030309686, + "grad_norm": 2.364366054534912, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7103369235992432, + 
"num_tokens": 257783664.0, + "step": 10181 + }, + { + "epoch": 1.1181638480122995, + "grad_norm": 2.484851360321045, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7102710008621216, + "num_tokens": 257805415.0, + "step": 10182 + }, + { + "epoch": 1.1182736657149133, + "grad_norm": 2.0503222942352295, + "learning_rate": 1e-06, + "loss": 0.8528, + "mean_token_accuracy": 0.7299754619598389, + "num_tokens": 257834294.0, + "step": 10183 + }, + { + "epoch": 1.1183834834175268, + "grad_norm": 2.4347574710845947, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7055723667144775, + "num_tokens": 257856827.0, + "step": 10184 + }, + { + "epoch": 1.1184933011201406, + "grad_norm": 2.301551342010498, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7194881439208984, + "num_tokens": 257880214.0, + "step": 10185 + }, + { + "epoch": 1.1186031188227543, + "grad_norm": 2.374403715133667, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7156846523284912, + "num_tokens": 257904528.0, + "step": 10186 + }, + { + "epoch": 1.1187129365253679, + "grad_norm": 2.364755868911743, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.6918562054634094, + "num_tokens": 257930280.0, + "step": 10187 + }, + { + "epoch": 1.1188227542279816, + "grad_norm": 2.6121671199798584, + "learning_rate": 1e-06, + "loss": 0.8397, + "mean_token_accuracy": 0.7318029403686523, + "num_tokens": 257950679.0, + "step": 10188 + }, + { + "epoch": 1.1189325719305951, + "grad_norm": 2.0996084213256836, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7048647999763489, + "num_tokens": 257978989.0, + "step": 10189 + }, + { + "epoch": 1.119042389633209, + "grad_norm": 2.30678391456604, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7150125503540039, + "num_tokens": 258002864.0, + "step": 10190 + }, + { + "epoch": 1.1191522073358224, + "grad_norm": 2.233591079711914, + 
"learning_rate": 1e-06, + "loss": 0.879, + "mean_token_accuracy": 0.723737359046936, + "num_tokens": 258027387.0, + "step": 10191 + }, + { + "epoch": 1.1192620250384362, + "grad_norm": 2.5163018703460693, + "learning_rate": 1e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7217224836349487, + "num_tokens": 258047438.0, + "step": 10192 + }, + { + "epoch": 1.11937184274105, + "grad_norm": 2.4473423957824707, + "learning_rate": 1e-06, + "loss": 0.8272, + "mean_token_accuracy": 0.7404301166534424, + "num_tokens": 258069050.0, + "step": 10193 + }, + { + "epoch": 1.1194816604436635, + "grad_norm": 2.3356716632843018, + "learning_rate": 1e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7245621681213379, + "num_tokens": 258092158.0, + "step": 10194 + }, + { + "epoch": 1.1195914781462772, + "grad_norm": 2.1572024822235107, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.6987649202346802, + "num_tokens": 258120853.0, + "step": 10195 + }, + { + "epoch": 1.1197012958488908, + "grad_norm": 2.4052066802978516, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.716850996017456, + "num_tokens": 258143148.0, + "step": 10196 + }, + { + "epoch": 1.1198111135515045, + "grad_norm": 2.4409537315368652, + "learning_rate": 1e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.7324712872505188, + "num_tokens": 258166892.0, + "step": 10197 + }, + { + "epoch": 1.119920931254118, + "grad_norm": 2.2104058265686035, + "learning_rate": 1e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.7260099053382874, + "num_tokens": 258193268.0, + "step": 10198 + }, + { + "epoch": 1.1200307489567318, + "grad_norm": 2.178988218307495, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7042073011398315, + "num_tokens": 258221890.0, + "step": 10199 + }, + { + "epoch": 1.1201405666593456, + "grad_norm": 2.452496290206909, + "learning_rate": 1e-06, + "loss": 0.8212, + "mean_token_accuracy": 0.7410699129104614, + "num_tokens": 258242648.0, + "step": 
10200 + }, + { + "epoch": 1.1202503843619591, + "grad_norm": 2.0336809158325195, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.6850671172142029, + "num_tokens": 258275886.0, + "step": 10201 + }, + { + "epoch": 1.1203602020645729, + "grad_norm": 2.2744662761688232, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7065399289131165, + "num_tokens": 258304390.0, + "step": 10202 + }, + { + "epoch": 1.1204700197671864, + "grad_norm": 2.3988258838653564, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7085191607475281, + "num_tokens": 258331777.0, + "step": 10203 + }, + { + "epoch": 1.1205798374698002, + "grad_norm": 2.3271303176879883, + "learning_rate": 1e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.723504364490509, + "num_tokens": 258356735.0, + "step": 10204 + }, + { + "epoch": 1.1206896551724137, + "grad_norm": 2.118715763092041, + "learning_rate": 1e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.6988219618797302, + "num_tokens": 258385987.0, + "step": 10205 + }, + { + "epoch": 1.1207994728750275, + "grad_norm": 2.116297721862793, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7093911170959473, + "num_tokens": 258415686.0, + "step": 10206 + }, + { + "epoch": 1.1209092905776412, + "grad_norm": 2.551645278930664, + "learning_rate": 1e-06, + "loss": 0.8546, + "mean_token_accuracy": 0.7434897422790527, + "num_tokens": 258435915.0, + "step": 10207 + }, + { + "epoch": 1.1210191082802548, + "grad_norm": 2.36808705329895, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7020845413208008, + "num_tokens": 258459857.0, + "step": 10208 + }, + { + "epoch": 1.1211289259828685, + "grad_norm": 2.2508561611175537, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7092045545578003, + "num_tokens": 258487719.0, + "step": 10209 + }, + { + "epoch": 1.121238743685482, + "grad_norm": 2.492098093032837, + "learning_rate": 1e-06, + "loss": 0.8854, + 
"mean_token_accuracy": 0.7205660939216614, + "num_tokens": 258509710.0, + "step": 10210 + }, + { + "epoch": 1.1213485613880958, + "grad_norm": 2.2171027660369873, + "learning_rate": 1e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7207503318786621, + "num_tokens": 258535616.0, + "step": 10211 + }, + { + "epoch": 1.1214583790907093, + "grad_norm": 2.1931257247924805, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7114492058753967, + "num_tokens": 258562225.0, + "step": 10212 + }, + { + "epoch": 1.121568196793323, + "grad_norm": 2.3287646770477295, + "learning_rate": 1e-06, + "loss": 0.8556, + "mean_token_accuracy": 0.7312887907028198, + "num_tokens": 258584500.0, + "step": 10213 + }, + { + "epoch": 1.1216780144959368, + "grad_norm": 2.0449726581573486, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7106671333312988, + "num_tokens": 258614167.0, + "step": 10214 + }, + { + "epoch": 1.1217878321985504, + "grad_norm": 2.4085142612457275, + "learning_rate": 1e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.720782995223999, + "num_tokens": 258637065.0, + "step": 10215 + }, + { + "epoch": 1.1218976499011641, + "grad_norm": 2.2225587368011475, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7000982761383057, + "num_tokens": 258662225.0, + "step": 10216 + }, + { + "epoch": 1.1220074676037777, + "grad_norm": 2.6488685607910156, + "learning_rate": 1e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.7327353358268738, + "num_tokens": 258680155.0, + "step": 10217 + }, + { + "epoch": 1.1221172853063914, + "grad_norm": 2.1825032234191895, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.6911563873291016, + "num_tokens": 258710484.0, + "step": 10218 + }, + { + "epoch": 1.122227103009005, + "grad_norm": 2.3582382202148438, + "learning_rate": 1e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.7370958924293518, + "num_tokens": 258732632.0, + "step": 10219 + }, + { + "epoch": 
1.1223369207116187, + "grad_norm": 2.1690616607666016, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7043598890304565, + "num_tokens": 258758607.0, + "step": 10220 + }, + { + "epoch": 1.1224467384142325, + "grad_norm": 2.1343870162963867, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7138679027557373, + "num_tokens": 258784830.0, + "step": 10221 + }, + { + "epoch": 1.122556556116846, + "grad_norm": 2.1121890544891357, + "learning_rate": 1e-06, + "loss": 1.0476, + "mean_token_accuracy": 0.6805825233459473, + "num_tokens": 258816066.0, + "step": 10222 + }, + { + "epoch": 1.1226663738194598, + "grad_norm": 1.8272916078567505, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7176364660263062, + "num_tokens": 258850171.0, + "step": 10223 + }, + { + "epoch": 1.1227761915220733, + "grad_norm": 2.23171067237854, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7010651230812073, + "num_tokens": 258874919.0, + "step": 10224 + }, + { + "epoch": 1.122886009224687, + "grad_norm": 2.1271114349365234, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.7090708017349243, + "num_tokens": 258904890.0, + "step": 10225 + }, + { + "epoch": 1.1229958269273006, + "grad_norm": 2.172257661819458, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.6979345679283142, + "num_tokens": 258935755.0, + "step": 10226 + }, + { + "epoch": 1.1231056446299144, + "grad_norm": 2.6021697521209717, + "learning_rate": 1e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.7242746353149414, + "num_tokens": 258955094.0, + "step": 10227 + }, + { + "epoch": 1.123215462332528, + "grad_norm": 2.168440580368042, + "learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7238328456878662, + "num_tokens": 258982681.0, + "step": 10228 + }, + { + "epoch": 1.1233252800351416, + "grad_norm": 2.1433472633361816, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 
0.7187250852584839, + "num_tokens": 259010015.0, + "step": 10229 + }, + { + "epoch": 1.1234350977377554, + "grad_norm": 2.1877567768096924, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7249028086662292, + "num_tokens": 259035293.0, + "step": 10230 + }, + { + "epoch": 1.123544915440369, + "grad_norm": 2.147880792617798, + "learning_rate": 1e-06, + "loss": 1.0194, + "mean_token_accuracy": 0.693468451499939, + "num_tokens": 259064990.0, + "step": 10231 + }, + { + "epoch": 1.1236547331429827, + "grad_norm": 2.6667838096618652, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7224977016448975, + "num_tokens": 259085279.0, + "step": 10232 + }, + { + "epoch": 1.1237645508455962, + "grad_norm": 2.154383420944214, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7170494794845581, + "num_tokens": 259110994.0, + "step": 10233 + }, + { + "epoch": 1.12387436854821, + "grad_norm": 2.290601968765259, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7161319255828857, + "num_tokens": 259135087.0, + "step": 10234 + }, + { + "epoch": 1.1239841862508237, + "grad_norm": 2.5457711219787598, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.726901650428772, + "num_tokens": 259156508.0, + "step": 10235 + }, + { + "epoch": 1.1240940039534373, + "grad_norm": 2.4643394947052, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7350976467132568, + "num_tokens": 259176867.0, + "step": 10236 + }, + { + "epoch": 1.124203821656051, + "grad_norm": 2.369279384613037, + "learning_rate": 1e-06, + "loss": 0.8436, + "mean_token_accuracy": 0.732500433921814, + "num_tokens": 259199852.0, + "step": 10237 + }, + { + "epoch": 1.1243136393586646, + "grad_norm": 2.3526134490966797, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7164044976234436, + "num_tokens": 259223046.0, + "step": 10238 + }, + { + "epoch": 1.1244234570612783, + "grad_norm": 
2.298492908477783, + "learning_rate": 1e-06, + "loss": 1.0269, + "mean_token_accuracy": 0.6983038187026978, + "num_tokens": 259248548.0, + "step": 10239 + }, + { + "epoch": 1.1245332747638919, + "grad_norm": 2.1567869186401367, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7038742899894714, + "num_tokens": 259278393.0, + "step": 10240 + }, + { + "epoch": 1.1246430924665056, + "grad_norm": 2.0895395278930664, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.715558648109436, + "num_tokens": 259306917.0, + "step": 10241 + }, + { + "epoch": 1.1247529101691192, + "grad_norm": 2.2185308933258057, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.711624026298523, + "num_tokens": 259335686.0, + "step": 10242 + }, + { + "epoch": 1.124862727871733, + "grad_norm": 2.140751600265503, + "learning_rate": 1e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.7313657999038696, + "num_tokens": 259363014.0, + "step": 10243 + }, + { + "epoch": 1.1249725455743467, + "grad_norm": 2.3312175273895264, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7244014739990234, + "num_tokens": 259387895.0, + "step": 10244 + }, + { + "epoch": 1.1250823632769602, + "grad_norm": 2.4266774654388428, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7103384733200073, + "num_tokens": 259411221.0, + "step": 10245 + }, + { + "epoch": 1.125192180979574, + "grad_norm": 2.20540452003479, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.6873255968093872, + "num_tokens": 259441260.0, + "step": 10246 + }, + { + "epoch": 1.1253019986821875, + "grad_norm": 2.48606538772583, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7263997793197632, + "num_tokens": 259463123.0, + "step": 10247 + }, + { + "epoch": 1.1254118163848013, + "grad_norm": 2.1847126483917236, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.716808021068573, + "num_tokens": 
259489306.0, + "step": 10248 + }, + { + "epoch": 1.125521634087415, + "grad_norm": 2.3348701000213623, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7191662788391113, + "num_tokens": 259515498.0, + "step": 10249 + }, + { + "epoch": 1.1256314517900285, + "grad_norm": 2.538433074951172, + "learning_rate": 1e-06, + "loss": 0.8237, + "mean_token_accuracy": 0.7408837080001831, + "num_tokens": 259535516.0, + "step": 10250 + }, + { + "epoch": 1.1257412694926423, + "grad_norm": 2.2609879970550537, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.6993152499198914, + "num_tokens": 259561865.0, + "step": 10251 + }, + { + "epoch": 1.1258510871952558, + "grad_norm": 2.1761317253112793, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7174830436706543, + "num_tokens": 259589442.0, + "step": 10252 + }, + { + "epoch": 1.1259609048978696, + "grad_norm": 2.0990242958068848, + "learning_rate": 1e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7305941581726074, + "num_tokens": 259616777.0, + "step": 10253 + }, + { + "epoch": 1.1260707226004831, + "grad_norm": 2.2878661155700684, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7204052209854126, + "num_tokens": 259641553.0, + "step": 10254 + }, + { + "epoch": 1.1261805403030969, + "grad_norm": 2.1229331493377686, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.710462749004364, + "num_tokens": 259670876.0, + "step": 10255 + }, + { + "epoch": 1.1262903580057104, + "grad_norm": 2.5440750122070312, + "learning_rate": 1e-06, + "loss": 0.8122, + "mean_token_accuracy": 0.7414180040359497, + "num_tokens": 259691938.0, + "step": 10256 + }, + { + "epoch": 1.1264001757083242, + "grad_norm": 2.2174017429351807, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7118839025497437, + "num_tokens": 259718467.0, + "step": 10257 + }, + { + "epoch": 1.126509993410938, + "grad_norm": 2.227945327758789, + "learning_rate": 
1e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7149876356124878, + "num_tokens": 259744782.0, + "step": 10258 + }, + { + "epoch": 1.1266198111135515, + "grad_norm": 2.2194292545318604, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.6934142708778381, + "num_tokens": 259773035.0, + "step": 10259 + }, + { + "epoch": 1.1267296288161652, + "grad_norm": 2.2132492065429688, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7060478925704956, + "num_tokens": 259799424.0, + "step": 10260 + }, + { + "epoch": 1.1268394465187788, + "grad_norm": 2.1692087650299072, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7138235569000244, + "num_tokens": 259826820.0, + "step": 10261 + }, + { + "epoch": 1.1269492642213925, + "grad_norm": 2.324448347091675, + "learning_rate": 1e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.7310022711753845, + "num_tokens": 259850785.0, + "step": 10262 + }, + { + "epoch": 1.127059081924006, + "grad_norm": 2.5856106281280518, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7276979684829712, + "num_tokens": 259870474.0, + "step": 10263 + }, + { + "epoch": 1.1271688996266198, + "grad_norm": 2.505628824234009, + "learning_rate": 1e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.7228111624717712, + "num_tokens": 259892637.0, + "step": 10264 + }, + { + "epoch": 1.1272787173292333, + "grad_norm": 2.2387001514434814, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7055197954177856, + "num_tokens": 259919245.0, + "step": 10265 + }, + { + "epoch": 1.127388535031847, + "grad_norm": 2.1281192302703857, + "learning_rate": 1e-06, + "loss": 0.7749, + "mean_token_accuracy": 0.7514019012451172, + "num_tokens": 259944018.0, + "step": 10266 + }, + { + "epoch": 1.1274983527344609, + "grad_norm": 2.265896797180176, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.711020290851593, + "num_tokens": 259970593.0, + "step": 10267 + }, + { 
+ "epoch": 1.1276081704370744, + "grad_norm": 2.6547207832336426, + "learning_rate": 1e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7338652014732361, + "num_tokens": 259990958.0, + "step": 10268 + }, + { + "epoch": 1.1277179881396882, + "grad_norm": 3.129992723464966, + "learning_rate": 1e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.711883544921875, + "num_tokens": 260009878.0, + "step": 10269 + }, + { + "epoch": 1.1278278058423017, + "grad_norm": 2.4090142250061035, + "learning_rate": 1e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.7227942943572998, + "num_tokens": 260033057.0, + "step": 10270 + }, + { + "epoch": 1.1279376235449154, + "grad_norm": 2.194532632827759, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7079339623451233, + "num_tokens": 260058757.0, + "step": 10271 + }, + { + "epoch": 1.1280474412475292, + "grad_norm": 2.147803783416748, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7161273956298828, + "num_tokens": 260089027.0, + "step": 10272 + }, + { + "epoch": 1.1281572589501427, + "grad_norm": 2.255786895751953, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7158597707748413, + "num_tokens": 260113373.0, + "step": 10273 + }, + { + "epoch": 1.1282670766527565, + "grad_norm": 2.5310773849487305, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7125946879386902, + "num_tokens": 260135460.0, + "step": 10274 + }, + { + "epoch": 1.12837689435537, + "grad_norm": 2.2480580806732178, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7244341969490051, + "num_tokens": 260160389.0, + "step": 10275 + }, + { + "epoch": 1.1284867120579838, + "grad_norm": 2.1504883766174316, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7244479656219482, + "num_tokens": 260185947.0, + "step": 10276 + }, + { + "epoch": 1.1285965297605973, + "grad_norm": 2.748598098754883, + "learning_rate": 1e-06, + "loss": 0.8908, + 
"mean_token_accuracy": 0.7234597206115723, + "num_tokens": 260203569.0, + "step": 10277 + }, + { + "epoch": 1.128706347463211, + "grad_norm": 2.650993585586548, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7087109684944153, + "num_tokens": 260225214.0, + "step": 10278 + }, + { + "epoch": 1.1288161651658246, + "grad_norm": 2.143421173095703, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.6967779994010925, + "num_tokens": 260253793.0, + "step": 10279 + }, + { + "epoch": 1.1289259828684384, + "grad_norm": 2.3668646812438965, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7175914645195007, + "num_tokens": 260278623.0, + "step": 10280 + }, + { + "epoch": 1.1290358005710521, + "grad_norm": 2.0607399940490723, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7177611589431763, + "num_tokens": 260306759.0, + "step": 10281 + }, + { + "epoch": 1.1291456182736657, + "grad_norm": 2.5192525386810303, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7220059633255005, + "num_tokens": 260328594.0, + "step": 10282 + }, + { + "epoch": 1.1292554359762794, + "grad_norm": 2.08510422706604, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7011463642120361, + "num_tokens": 260357188.0, + "step": 10283 + }, + { + "epoch": 1.129365253678893, + "grad_norm": 2.1025068759918213, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.6944836974143982, + "num_tokens": 260387458.0, + "step": 10284 + }, + { + "epoch": 1.1294750713815067, + "grad_norm": 2.036774158477783, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7276495099067688, + "num_tokens": 260419192.0, + "step": 10285 + }, + { + "epoch": 1.1295848890841205, + "grad_norm": 2.37113356590271, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7173168063163757, + "num_tokens": 260443517.0, + "step": 10286 + }, + { + "epoch": 1.129694706786734, + 
"grad_norm": 2.4760890007019043, + "learning_rate": 1e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7385849356651306, + "num_tokens": 260464506.0, + "step": 10287 + }, + { + "epoch": 1.1298045244893478, + "grad_norm": 2.406029462814331, + "learning_rate": 1e-06, + "loss": 0.7944, + "mean_token_accuracy": 0.7485287189483643, + "num_tokens": 260485541.0, + "step": 10288 + }, + { + "epoch": 1.1299143421919613, + "grad_norm": 2.3089897632598877, + "learning_rate": 1e-06, + "loss": 0.8523, + "mean_token_accuracy": 0.7298946380615234, + "num_tokens": 260509694.0, + "step": 10289 + }, + { + "epoch": 1.130024159894575, + "grad_norm": 2.054859161376953, + "learning_rate": 1e-06, + "loss": 1.0377, + "mean_token_accuracy": 0.6843743920326233, + "num_tokens": 260539218.0, + "step": 10290 + }, + { + "epoch": 1.1301339775971886, + "grad_norm": 2.3791568279266357, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7190530896186829, + "num_tokens": 260561645.0, + "step": 10291 + }, + { + "epoch": 1.1302437952998023, + "grad_norm": 2.0944390296936035, + "learning_rate": 1e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7206268310546875, + "num_tokens": 260590536.0, + "step": 10292 + }, + { + "epoch": 1.1303536130024159, + "grad_norm": 2.1192824840545654, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7164126634597778, + "num_tokens": 260617921.0, + "step": 10293 + }, + { + "epoch": 1.1304634307050296, + "grad_norm": 2.681650400161743, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7043052911758423, + "num_tokens": 260638175.0, + "step": 10294 + }, + { + "epoch": 1.1305732484076434, + "grad_norm": 2.451061487197876, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7229722738265991, + "num_tokens": 260659857.0, + "step": 10295 + }, + { + "epoch": 1.130683066110257, + "grad_norm": 2.013223886489868, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.710013747215271, + 
"num_tokens": 260691722.0, + "step": 10296 + }, + { + "epoch": 1.1307928838128707, + "grad_norm": 2.3577492237091064, + "learning_rate": 1e-06, + "loss": 0.8506, + "mean_token_accuracy": 0.743349552154541, + "num_tokens": 260713777.0, + "step": 10297 + }, + { + "epoch": 1.1309027015154842, + "grad_norm": 2.420487642288208, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7095301747322083, + "num_tokens": 260735402.0, + "step": 10298 + }, + { + "epoch": 1.131012519218098, + "grad_norm": 2.3361263275146484, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7130296230316162, + "num_tokens": 260760395.0, + "step": 10299 + }, + { + "epoch": 1.1311223369207117, + "grad_norm": 2.3147213459014893, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7052823305130005, + "num_tokens": 260784511.0, + "step": 10300 + }, + { + "epoch": 1.1312321546233253, + "grad_norm": 2.297034740447998, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7045705914497375, + "num_tokens": 260809828.0, + "step": 10301 + }, + { + "epoch": 1.131341972325939, + "grad_norm": 2.2585794925689697, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7198237776756287, + "num_tokens": 260834450.0, + "step": 10302 + }, + { + "epoch": 1.1314517900285526, + "grad_norm": 2.2882583141326904, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7061012387275696, + "num_tokens": 260861415.0, + "step": 10303 + }, + { + "epoch": 1.1315616077311663, + "grad_norm": 2.5997486114501953, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7254663109779358, + "num_tokens": 260882945.0, + "step": 10304 + }, + { + "epoch": 1.1316714254337799, + "grad_norm": 2.3432297706604004, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7294590473175049, + "num_tokens": 260905850.0, + "step": 10305 + }, + { + "epoch": 1.1317812431363936, + "grad_norm": 1.9645264148712158, + 
"learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7204104661941528, + "num_tokens": 260939255.0, + "step": 10306 + }, + { + "epoch": 1.1318910608390071, + "grad_norm": 2.1932196617126465, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.707200288772583, + "num_tokens": 260967339.0, + "step": 10307 + }, + { + "epoch": 1.132000878541621, + "grad_norm": 2.599491596221924, + "learning_rate": 1e-06, + "loss": 0.787, + "mean_token_accuracy": 0.7439322471618652, + "num_tokens": 260986230.0, + "step": 10308 + }, + { + "epoch": 1.1321106962442347, + "grad_norm": 2.12454891204834, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7081601023674011, + "num_tokens": 261014186.0, + "step": 10309 + }, + { + "epoch": 1.1322205139468482, + "grad_norm": 2.082594156265259, + "learning_rate": 1e-06, + "loss": 0.8284, + "mean_token_accuracy": 0.7379090785980225, + "num_tokens": 261040732.0, + "step": 10310 + }, + { + "epoch": 1.132330331649462, + "grad_norm": 2.166853189468384, + "learning_rate": 1e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7120276689529419, + "num_tokens": 261069261.0, + "step": 10311 + }, + { + "epoch": 1.1324401493520755, + "grad_norm": 2.4778103828430176, + "learning_rate": 1e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.7264432311058044, + "num_tokens": 261091694.0, + "step": 10312 + }, + { + "epoch": 1.1325499670546892, + "grad_norm": 2.10418438911438, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.7037390470504761, + "num_tokens": 261120889.0, + "step": 10313 + }, + { + "epoch": 1.132659784757303, + "grad_norm": 2.799398422241211, + "learning_rate": 1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7201269865036011, + "num_tokens": 261139548.0, + "step": 10314 + }, + { + "epoch": 1.1327696024599165, + "grad_norm": 2.7590389251708984, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7109398245811462, + "num_tokens": 261159167.0, + "step": 10315 
+ }, + { + "epoch": 1.1328794201625303, + "grad_norm": 2.1794936656951904, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7210623621940613, + "num_tokens": 261186123.0, + "step": 10316 + }, + { + "epoch": 1.1329892378651438, + "grad_norm": 2.3020524978637695, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7130226492881775, + "num_tokens": 261210529.0, + "step": 10317 + }, + { + "epoch": 1.1330990555677576, + "grad_norm": 2.018188238143921, + "learning_rate": 1e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.708681583404541, + "num_tokens": 261239918.0, + "step": 10318 + }, + { + "epoch": 1.1332088732703711, + "grad_norm": 2.2745227813720703, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7132298946380615, + "num_tokens": 261266394.0, + "step": 10319 + }, + { + "epoch": 1.1333186909729849, + "grad_norm": 2.1531293392181396, + "learning_rate": 1e-06, + "loss": 0.841, + "mean_token_accuracy": 0.7362005114555359, + "num_tokens": 261291901.0, + "step": 10320 + }, + { + "epoch": 1.1334285086755984, + "grad_norm": 2.367982864379883, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.707966685295105, + "num_tokens": 261318891.0, + "step": 10321 + }, + { + "epoch": 1.1335383263782122, + "grad_norm": 2.698946237564087, + "learning_rate": 1e-06, + "loss": 0.8504, + "mean_token_accuracy": 0.7332190275192261, + "num_tokens": 261336688.0, + "step": 10322 + }, + { + "epoch": 1.133648144080826, + "grad_norm": 2.0211381912231445, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7109733819961548, + "num_tokens": 261366911.0, + "step": 10323 + }, + { + "epoch": 1.1337579617834395, + "grad_norm": 2.528884172439575, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7107816934585571, + "num_tokens": 261389378.0, + "step": 10324 + }, + { + "epoch": 1.1338677794860532, + "grad_norm": 2.157172679901123, + "learning_rate": 1e-06, + "loss": 0.9799, + 
"mean_token_accuracy": 0.7003229856491089, + "num_tokens": 261421421.0, + "step": 10325 + }, + { + "epoch": 1.1339775971886668, + "grad_norm": 2.3479180335998535, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7014633417129517, + "num_tokens": 261447595.0, + "step": 10326 + }, + { + "epoch": 1.1340874148912805, + "grad_norm": 2.40118408203125, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7094126343727112, + "num_tokens": 261471945.0, + "step": 10327 + }, + { + "epoch": 1.134197232593894, + "grad_norm": 2.4243979454040527, + "learning_rate": 1e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.722348153591156, + "num_tokens": 261494402.0, + "step": 10328 + }, + { + "epoch": 1.1343070502965078, + "grad_norm": 2.196040153503418, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7001896500587463, + "num_tokens": 261523343.0, + "step": 10329 + }, + { + "epoch": 1.1344168679991213, + "grad_norm": 2.4514002799987793, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7273097634315491, + "num_tokens": 261545118.0, + "step": 10330 + }, + { + "epoch": 1.134526685701735, + "grad_norm": 2.222320795059204, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7246529459953308, + "num_tokens": 261569983.0, + "step": 10331 + }, + { + "epoch": 1.1346365034043489, + "grad_norm": 2.2123684883117676, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7114102840423584, + "num_tokens": 261596936.0, + "step": 10332 + }, + { + "epoch": 1.1347463211069624, + "grad_norm": 2.329543113708496, + "learning_rate": 1e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7251077890396118, + "num_tokens": 261620070.0, + "step": 10333 + }, + { + "epoch": 1.1348561388095761, + "grad_norm": 2.2951223850250244, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7037888765335083, + "num_tokens": 261646768.0, + "step": 10334 + }, + { + "epoch": 
1.1349659565121897, + "grad_norm": 2.1721365451812744, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.6998064517974854, + "num_tokens": 261677744.0, + "step": 10335 + }, + { + "epoch": 1.1350757742148034, + "grad_norm": 2.241722583770752, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7136063575744629, + "num_tokens": 261704249.0, + "step": 10336 + }, + { + "epoch": 1.1351855919174172, + "grad_norm": 2.0997061729431152, + "learning_rate": 1e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7359391450881958, + "num_tokens": 261731741.0, + "step": 10337 + }, + { + "epoch": 1.1352954096200307, + "grad_norm": 2.2156591415405273, + "learning_rate": 1e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7184561491012573, + "num_tokens": 261759240.0, + "step": 10338 + }, + { + "epoch": 1.1354052273226445, + "grad_norm": 2.5770113468170166, + "learning_rate": 1e-06, + "loss": 0.8383, + "mean_token_accuracy": 0.737964928150177, + "num_tokens": 261778867.0, + "step": 10339 + }, + { + "epoch": 1.135515045025258, + "grad_norm": 2.270827054977417, + "learning_rate": 1e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7019628286361694, + "num_tokens": 261804529.0, + "step": 10340 + }, + { + "epoch": 1.1356248627278718, + "grad_norm": 2.0904486179351807, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.6950887441635132, + "num_tokens": 261834050.0, + "step": 10341 + }, + { + "epoch": 1.1357346804304853, + "grad_norm": 2.7016665935516357, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7334858179092407, + "num_tokens": 261854194.0, + "step": 10342 + }, + { + "epoch": 1.135844498133099, + "grad_norm": 2.020312547683716, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7176992893218994, + "num_tokens": 261884888.0, + "step": 10343 + }, + { + "epoch": 1.1359543158357126, + "grad_norm": 2.605224847793579, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 
0.7050830125808716, + "num_tokens": 261909558.0, + "step": 10344 + }, + { + "epoch": 1.1360641335383264, + "grad_norm": 2.2238879203796387, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7165036201477051, + "num_tokens": 261935209.0, + "step": 10345 + }, + { + "epoch": 1.1361739512409401, + "grad_norm": 2.1911909580230713, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7202613353729248, + "num_tokens": 261960587.0, + "step": 10346 + }, + { + "epoch": 1.1362837689435537, + "grad_norm": 2.3681187629699707, + "learning_rate": 1e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.7295175790786743, + "num_tokens": 261983200.0, + "step": 10347 + }, + { + "epoch": 1.1363935866461674, + "grad_norm": 2.1425349712371826, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.6952930688858032, + "num_tokens": 262013990.0, + "step": 10348 + }, + { + "epoch": 1.136503404348781, + "grad_norm": 2.382387399673462, + "learning_rate": 1e-06, + "loss": 0.7923, + "mean_token_accuracy": 0.7516125440597534, + "num_tokens": 262035457.0, + "step": 10349 + }, + { + "epoch": 1.1366132220513947, + "grad_norm": 2.4111897945404053, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7156542539596558, + "num_tokens": 262058530.0, + "step": 10350 + }, + { + "epoch": 1.1367230397540085, + "grad_norm": 2.504193067550659, + "learning_rate": 1e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7252670526504517, + "num_tokens": 262078950.0, + "step": 10351 + }, + { + "epoch": 1.136832857456622, + "grad_norm": 2.35937762260437, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7211773991584778, + "num_tokens": 262102968.0, + "step": 10352 + }, + { + "epoch": 1.1369426751592357, + "grad_norm": 2.2888827323913574, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7165639400482178, + "num_tokens": 262127049.0, + "step": 10353 + }, + { + "epoch": 1.1370524928618493, + "grad_norm": 
2.499905586242676, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.6975952386856079, + "num_tokens": 262150313.0, + "step": 10354 + }, + { + "epoch": 1.137162310564463, + "grad_norm": 2.177748441696167, + "learning_rate": 1e-06, + "loss": 0.8545, + "mean_token_accuracy": 0.7378286123275757, + "num_tokens": 262173868.0, + "step": 10355 + }, + { + "epoch": 1.1372721282670766, + "grad_norm": 2.334820032119751, + "learning_rate": 1e-06, + "loss": 0.8262, + "mean_token_accuracy": 0.7355225682258606, + "num_tokens": 262196865.0, + "step": 10356 + }, + { + "epoch": 1.1373819459696903, + "grad_norm": 1.97780179977417, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7311004400253296, + "num_tokens": 262227087.0, + "step": 10357 + }, + { + "epoch": 1.1374917636723039, + "grad_norm": 2.3529813289642334, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7179247140884399, + "num_tokens": 262249942.0, + "step": 10358 + }, + { + "epoch": 1.1376015813749176, + "grad_norm": 2.1605212688446045, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.710141658782959, + "num_tokens": 262276513.0, + "step": 10359 + }, + { + "epoch": 1.1377113990775314, + "grad_norm": 2.147378921508789, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7115514278411865, + "num_tokens": 262304662.0, + "step": 10360 + }, + { + "epoch": 1.137821216780145, + "grad_norm": 2.217410087585449, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7000748515129089, + "num_tokens": 262331422.0, + "step": 10361 + }, + { + "epoch": 1.1379310344827587, + "grad_norm": 2.3822360038757324, + "learning_rate": 1e-06, + "loss": 0.8234, + "mean_token_accuracy": 0.7375080585479736, + "num_tokens": 262354699.0, + "step": 10362 + }, + { + "epoch": 1.1380408521853722, + "grad_norm": 2.6528942584991455, + "learning_rate": 1e-06, + "loss": 0.8382, + "mean_token_accuracy": 0.7279417514801025, + "num_tokens": 
262373197.0, + "step": 10363 + }, + { + "epoch": 1.138150669887986, + "grad_norm": 2.2072222232818604, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7208622694015503, + "num_tokens": 262398088.0, + "step": 10364 + }, + { + "epoch": 1.1382604875905997, + "grad_norm": 2.1109416484832764, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7101119756698608, + "num_tokens": 262427835.0, + "step": 10365 + }, + { + "epoch": 1.1383703052932133, + "grad_norm": 2.2955315113067627, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.724224328994751, + "num_tokens": 262452174.0, + "step": 10366 + }, + { + "epoch": 1.138480122995827, + "grad_norm": 1.958706021308899, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7176271677017212, + "num_tokens": 262485038.0, + "step": 10367 + }, + { + "epoch": 1.1385899406984406, + "grad_norm": 2.17156720161438, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7116905450820923, + "num_tokens": 262511879.0, + "step": 10368 + }, + { + "epoch": 1.1386997584010543, + "grad_norm": 2.269388198852539, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7109771370887756, + "num_tokens": 262536505.0, + "step": 10369 + }, + { + "epoch": 1.1388095761036678, + "grad_norm": 2.197906732559204, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7280920147895813, + "num_tokens": 262562026.0, + "step": 10370 + }, + { + "epoch": 1.1389193938062816, + "grad_norm": 2.330294132232666, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7084890604019165, + "num_tokens": 262586248.0, + "step": 10371 + }, + { + "epoch": 1.1390292115088951, + "grad_norm": 2.184563159942627, + "learning_rate": 1e-06, + "loss": 0.8339, + "mean_token_accuracy": 0.7335236072540283, + "num_tokens": 262611941.0, + "step": 10372 + }, + { + "epoch": 1.139139029211509, + "grad_norm": 2.0666778087615967, + "learning_rate": 1e-06, 
+ "loss": 0.9775, + "mean_token_accuracy": 0.6971378326416016, + "num_tokens": 262641222.0, + "step": 10373 + }, + { + "epoch": 1.1392488469141226, + "grad_norm": 2.1188087463378906, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.6914111375808716, + "num_tokens": 262670227.0, + "step": 10374 + }, + { + "epoch": 1.1393586646167362, + "grad_norm": 2.206943988800049, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7132574319839478, + "num_tokens": 262697196.0, + "step": 10375 + }, + { + "epoch": 1.13946848231935, + "grad_norm": 2.3423564434051514, + "learning_rate": 1e-06, + "loss": 0.8725, + "mean_token_accuracy": 0.7218571901321411, + "num_tokens": 262721134.0, + "step": 10376 + }, + { + "epoch": 1.1395783000219635, + "grad_norm": 2.332056999206543, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7236437797546387, + "num_tokens": 262743604.0, + "step": 10377 + }, + { + "epoch": 1.1396881177245772, + "grad_norm": 2.0493199825286865, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.70358806848526, + "num_tokens": 262774463.0, + "step": 10378 + }, + { + "epoch": 1.1397979354271908, + "grad_norm": 2.1935019493103027, + "learning_rate": 1e-06, + "loss": 0.79, + "mean_token_accuracy": 0.7625964283943176, + "num_tokens": 262798936.0, + "step": 10379 + }, + { + "epoch": 1.1399077531298045, + "grad_norm": 2.6006827354431152, + "learning_rate": 1e-06, + "loss": 0.8278, + "mean_token_accuracy": 0.7406928539276123, + "num_tokens": 262819011.0, + "step": 10380 + }, + { + "epoch": 1.1400175708324183, + "grad_norm": 2.1414427757263184, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7132938504219055, + "num_tokens": 262846440.0, + "step": 10381 + }, + { + "epoch": 1.1401273885350318, + "grad_norm": 2.19293212890625, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7148100733757019, + "num_tokens": 262872190.0, + "step": 10382 + }, + { + "epoch": 
1.1402372062376456, + "grad_norm": 2.385648012161255, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7127951383590698, + "num_tokens": 262897407.0, + "step": 10383 + }, + { + "epoch": 1.140347023940259, + "grad_norm": 2.0318233966827393, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7046715021133423, + "num_tokens": 262929243.0, + "step": 10384 + }, + { + "epoch": 1.1404568416428729, + "grad_norm": 2.5010037422180176, + "learning_rate": 1e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7249854803085327, + "num_tokens": 262949967.0, + "step": 10385 + }, + { + "epoch": 1.1405666593454864, + "grad_norm": 2.6281306743621826, + "learning_rate": 1e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7431182861328125, + "num_tokens": 262968925.0, + "step": 10386 + }, + { + "epoch": 1.1406764770481002, + "grad_norm": 2.192272424697876, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7206554412841797, + "num_tokens": 262995393.0, + "step": 10387 + }, + { + "epoch": 1.140786294750714, + "grad_norm": 2.1890480518341064, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7132711410522461, + "num_tokens": 263021488.0, + "step": 10388 + }, + { + "epoch": 1.1408961124533274, + "grad_norm": 2.313969850540161, + "learning_rate": 1e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7356091737747192, + "num_tokens": 263043065.0, + "step": 10389 + }, + { + "epoch": 1.1410059301559412, + "grad_norm": 2.245896816253662, + "learning_rate": 1e-06, + "loss": 0.836, + "mean_token_accuracy": 0.7359967231750488, + "num_tokens": 263067647.0, + "step": 10390 + }, + { + "epoch": 1.1411157478585547, + "grad_norm": 2.0210306644439697, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7063621878623962, + "num_tokens": 263101007.0, + "step": 10391 + }, + { + "epoch": 1.1412255655611685, + "grad_norm": 2.147953748703003, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 
0.696133553981781, + "num_tokens": 263128937.0, + "step": 10392 + }, + { + "epoch": 1.141335383263782, + "grad_norm": 2.183295726776123, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.696402907371521, + "num_tokens": 263159459.0, + "step": 10393 + }, + { + "epoch": 1.1414452009663958, + "grad_norm": 2.4226458072662354, + "learning_rate": 1e-06, + "loss": 0.8523, + "mean_token_accuracy": 0.7344810962677002, + "num_tokens": 263181973.0, + "step": 10394 + }, + { + "epoch": 1.1415550186690093, + "grad_norm": 2.483079433441162, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7177637815475464, + "num_tokens": 263204494.0, + "step": 10395 + }, + { + "epoch": 1.141664836371623, + "grad_norm": 2.1011197566986084, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7127306461334229, + "num_tokens": 263233591.0, + "step": 10396 + }, + { + "epoch": 1.1417746540742368, + "grad_norm": 2.4418203830718994, + "learning_rate": 1e-06, + "loss": 0.7683, + "mean_token_accuracy": 0.7619673013687134, + "num_tokens": 263254684.0, + "step": 10397 + }, + { + "epoch": 1.1418844717768504, + "grad_norm": 2.1983535289764404, + "learning_rate": 1e-06, + "loss": 0.8474, + "mean_token_accuracy": 0.7269437313079834, + "num_tokens": 263281472.0, + "step": 10398 + }, + { + "epoch": 1.1419942894794641, + "grad_norm": 2.227931499481201, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7187540531158447, + "num_tokens": 263306767.0, + "step": 10399 + }, + { + "epoch": 1.1421041071820777, + "grad_norm": 2.282125473022461, + "learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7128453254699707, + "num_tokens": 263335881.0, + "step": 10400 + }, + { + "epoch": 1.1422139248846914, + "grad_norm": 2.531737804412842, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7126343250274658, + "num_tokens": 263362206.0, + "step": 10401 + }, + { + "epoch": 1.1423237425873052, + "grad_norm": 
2.3237884044647217, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.6912170648574829, + "num_tokens": 263389463.0, + "step": 10402 + }, + { + "epoch": 1.1424335602899187, + "grad_norm": 2.4082298278808594, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7021838426589966, + "num_tokens": 263412857.0, + "step": 10403 + }, + { + "epoch": 1.1425433779925325, + "grad_norm": 2.289874315261841, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.705924391746521, + "num_tokens": 263438640.0, + "step": 10404 + }, + { + "epoch": 1.142653195695146, + "grad_norm": 2.4231369495391846, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7154211401939392, + "num_tokens": 263460401.0, + "step": 10405 + }, + { + "epoch": 1.1427630133977598, + "grad_norm": 2.166201591491699, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7256774306297302, + "num_tokens": 263487264.0, + "step": 10406 + }, + { + "epoch": 1.1428728311003733, + "grad_norm": 2.256596326828003, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7113624811172485, + "num_tokens": 263515157.0, + "step": 10407 + }, + { + "epoch": 1.142982648802987, + "grad_norm": 2.5749809741973877, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7175091505050659, + "num_tokens": 263537992.0, + "step": 10408 + }, + { + "epoch": 1.1430924665056006, + "grad_norm": 2.17826509475708, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7118614912033081, + "num_tokens": 263564936.0, + "step": 10409 + }, + { + "epoch": 1.1432022842082143, + "grad_norm": 2.247328281402588, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7216624021530151, + "num_tokens": 263590200.0, + "step": 10410 + }, + { + "epoch": 1.143312101910828, + "grad_norm": 2.5196239948272705, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7184465527534485, + "num_tokens": 
263612205.0, + "step": 10411 + }, + { + "epoch": 1.1434219196134416, + "grad_norm": 2.6902153491973877, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7169845700263977, + "num_tokens": 263632323.0, + "step": 10412 + }, + { + "epoch": 1.1435317373160554, + "grad_norm": 2.5976040363311768, + "learning_rate": 1e-06, + "loss": 0.8453, + "mean_token_accuracy": 0.7410068511962891, + "num_tokens": 263652129.0, + "step": 10413 + }, + { + "epoch": 1.143641555018669, + "grad_norm": 2.290679931640625, + "learning_rate": 1e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.7338167428970337, + "num_tokens": 263678727.0, + "step": 10414 + }, + { + "epoch": 1.1437513727212827, + "grad_norm": 2.292658567428589, + "learning_rate": 1e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.7227241396903992, + "num_tokens": 263703110.0, + "step": 10415 + }, + { + "epoch": 1.1438611904238964, + "grad_norm": 2.2952380180358887, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7015455961227417, + "num_tokens": 263731194.0, + "step": 10416 + }, + { + "epoch": 1.14397100812651, + "grad_norm": 2.1520726680755615, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.728537917137146, + "num_tokens": 263758836.0, + "step": 10417 + }, + { + "epoch": 1.1440808258291237, + "grad_norm": 2.215697765350342, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.6978700160980225, + "num_tokens": 263784578.0, + "step": 10418 + }, + { + "epoch": 1.1441906435317373, + "grad_norm": 2.387505054473877, + "learning_rate": 1e-06, + "loss": 0.823, + "mean_token_accuracy": 0.7376053333282471, + "num_tokens": 263807127.0, + "step": 10419 + }, + { + "epoch": 1.144300461234351, + "grad_norm": 2.0477545261383057, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7005230188369751, + "num_tokens": 263838447.0, + "step": 10420 + }, + { + "epoch": 1.1444102789369646, + "grad_norm": 2.3787665367126465, + "learning_rate": 1e-06, 
+ "loss": 0.9206, + "mean_token_accuracy": 0.7237029075622559, + "num_tokens": 263861683.0, + "step": 10421 + }, + { + "epoch": 1.1445200966395783, + "grad_norm": 2.0605034828186035, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7156511545181274, + "num_tokens": 263892699.0, + "step": 10422 + }, + { + "epoch": 1.1446299143421919, + "grad_norm": 2.2696592807769775, + "learning_rate": 1e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7210532426834106, + "num_tokens": 263917657.0, + "step": 10423 + }, + { + "epoch": 1.1447397320448056, + "grad_norm": 2.2292842864990234, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7276778221130371, + "num_tokens": 263944358.0, + "step": 10424 + }, + { + "epoch": 1.1448495497474194, + "grad_norm": 2.4976613521575928, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7210338115692139, + "num_tokens": 263966647.0, + "step": 10425 + }, + { + "epoch": 1.144959367450033, + "grad_norm": 2.584951877593994, + "learning_rate": 1e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.731157660484314, + "num_tokens": 263988524.0, + "step": 10426 + }, + { + "epoch": 1.1450691851526467, + "grad_norm": 2.368515968322754, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7126411199569702, + "num_tokens": 264012903.0, + "step": 10427 + }, + { + "epoch": 1.1451790028552602, + "grad_norm": 2.6043407917022705, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7053369879722595, + "num_tokens": 264032303.0, + "step": 10428 + }, + { + "epoch": 1.145288820557874, + "grad_norm": 2.228971481323242, + "learning_rate": 1e-06, + "loss": 0.8256, + "mean_token_accuracy": 0.7408739328384399, + "num_tokens": 264058362.0, + "step": 10429 + }, + { + "epoch": 1.1453986382604877, + "grad_norm": 2.2640693187713623, + "learning_rate": 1e-06, + "loss": 0.917, + "mean_token_accuracy": 0.7125785946846008, + "num_tokens": 264084436.0, + "step": 10430 + }, + { + 
"epoch": 1.1455084559631012, + "grad_norm": 2.2120461463928223, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.717578649520874, + "num_tokens": 264110796.0, + "step": 10431 + }, + { + "epoch": 1.145618273665715, + "grad_norm": 2.2655885219573975, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7130370140075684, + "num_tokens": 264136667.0, + "step": 10432 + }, + { + "epoch": 1.1457280913683285, + "grad_norm": 2.47957181930542, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.726984441280365, + "num_tokens": 264157464.0, + "step": 10433 + }, + { + "epoch": 1.1458379090709423, + "grad_norm": 2.5323433876037598, + "learning_rate": 1e-06, + "loss": 0.8355, + "mean_token_accuracy": 0.7345811128616333, + "num_tokens": 264177839.0, + "step": 10434 + }, + { + "epoch": 1.1459477267735558, + "grad_norm": 2.276076316833496, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7425048351287842, + "num_tokens": 264203180.0, + "step": 10435 + }, + { + "epoch": 1.1460575444761696, + "grad_norm": 2.1796133518218994, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7246150970458984, + "num_tokens": 264229893.0, + "step": 10436 + }, + { + "epoch": 1.1461673621787831, + "grad_norm": 2.3768904209136963, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7093325853347778, + "num_tokens": 264254031.0, + "step": 10437 + }, + { + "epoch": 1.1462771798813969, + "grad_norm": 2.5657918453216553, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7215930223464966, + "num_tokens": 264275132.0, + "step": 10438 + }, + { + "epoch": 1.1463869975840106, + "grad_norm": 2.395925521850586, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7129217982292175, + "num_tokens": 264298123.0, + "step": 10439 + }, + { + "epoch": 1.1464968152866242, + "grad_norm": 2.416536569595337, + "learning_rate": 1e-06, + "loss": 0.8537, + 
"mean_token_accuracy": 0.7271188497543335, + "num_tokens": 264320614.0, + "step": 10440 + }, + { + "epoch": 1.146606632989238, + "grad_norm": 2.3636810779571533, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7116003632545471, + "num_tokens": 264345807.0, + "step": 10441 + }, + { + "epoch": 1.1467164506918515, + "grad_norm": 2.4336729049682617, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7172680497169495, + "num_tokens": 264371218.0, + "step": 10442 + }, + { + "epoch": 1.1468262683944652, + "grad_norm": 2.3812174797058105, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7059367299079895, + "num_tokens": 264394605.0, + "step": 10443 + }, + { + "epoch": 1.1469360860970788, + "grad_norm": 2.153705358505249, + "learning_rate": 1e-06, + "loss": 0.7852, + "mean_token_accuracy": 0.7533419132232666, + "num_tokens": 264420366.0, + "step": 10444 + }, + { + "epoch": 1.1470459037996925, + "grad_norm": 2.1429922580718994, + "learning_rate": 1e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7393622994422913, + "num_tokens": 264444564.0, + "step": 10445 + }, + { + "epoch": 1.147155721502306, + "grad_norm": 2.1641457080841064, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7175207138061523, + "num_tokens": 264471918.0, + "step": 10446 + }, + { + "epoch": 1.1472655392049198, + "grad_norm": 2.0418269634246826, + "learning_rate": 1e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.7268304824829102, + "num_tokens": 264501340.0, + "step": 10447 + }, + { + "epoch": 1.1473753569075336, + "grad_norm": 2.7527687549591064, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.697326123714447, + "num_tokens": 264520749.0, + "step": 10448 + }, + { + "epoch": 1.147485174610147, + "grad_norm": 2.328155994415283, + "learning_rate": 1e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7295284271240234, + "num_tokens": 264543834.0, + "step": 10449 + }, + { + "epoch": 
1.1475949923127609, + "grad_norm": 2.311096429824829, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7121284008026123, + "num_tokens": 264568473.0, + "step": 10450 + }, + { + "epoch": 1.1477048100153744, + "grad_norm": 2.2088098526000977, + "learning_rate": 1e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7210291624069214, + "num_tokens": 264594625.0, + "step": 10451 + }, + { + "epoch": 1.1478146277179881, + "grad_norm": 2.118176221847534, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7001463174819946, + "num_tokens": 264623066.0, + "step": 10452 + }, + { + "epoch": 1.147924445420602, + "grad_norm": 2.6714751720428467, + "learning_rate": 1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7180655598640442, + "num_tokens": 264643323.0, + "step": 10453 + }, + { + "epoch": 1.1480342631232154, + "grad_norm": 2.5644867420196533, + "learning_rate": 1e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7361376881599426, + "num_tokens": 264665019.0, + "step": 10454 + }, + { + "epoch": 1.1481440808258292, + "grad_norm": 2.3894197940826416, + "learning_rate": 1e-06, + "loss": 0.8321, + "mean_token_accuracy": 0.7376894950866699, + "num_tokens": 264688049.0, + "step": 10455 + }, + { + "epoch": 1.1482538985284427, + "grad_norm": 2.2067666053771973, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7258174419403076, + "num_tokens": 264713773.0, + "step": 10456 + }, + { + "epoch": 1.1483637162310565, + "grad_norm": 2.149409532546997, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7069292068481445, + "num_tokens": 264744124.0, + "step": 10457 + }, + { + "epoch": 1.14847353393367, + "grad_norm": 2.390123128890991, + "learning_rate": 1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.6953859329223633, + "num_tokens": 264768805.0, + "step": 10458 + }, + { + "epoch": 1.1485833516362838, + "grad_norm": 2.163813352584839, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 
0.6953166723251343, + "num_tokens": 264797758.0, + "step": 10459 + }, + { + "epoch": 1.1486931693388973, + "grad_norm": 2.540611743927002, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.726540207862854, + "num_tokens": 264819979.0, + "step": 10460 + }, + { + "epoch": 1.148802987041511, + "grad_norm": 2.793200969696045, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7277307510375977, + "num_tokens": 264838735.0, + "step": 10461 + }, + { + "epoch": 1.1489128047441248, + "grad_norm": 2.3686344623565674, + "learning_rate": 1e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7302142381668091, + "num_tokens": 264860019.0, + "step": 10462 + }, + { + "epoch": 1.1490226224467384, + "grad_norm": 2.2706074714660645, + "learning_rate": 1e-06, + "loss": 0.8135, + "mean_token_accuracy": 0.7414275407791138, + "num_tokens": 264884003.0, + "step": 10463 + }, + { + "epoch": 1.1491324401493521, + "grad_norm": 2.3679299354553223, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7146151661872864, + "num_tokens": 264906793.0, + "step": 10464 + }, + { + "epoch": 1.1492422578519657, + "grad_norm": 2.471444606781006, + "learning_rate": 1e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.7410696744918823, + "num_tokens": 264928778.0, + "step": 10465 + }, + { + "epoch": 1.1493520755545794, + "grad_norm": 2.098659038543701, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7159502506256104, + "num_tokens": 264956773.0, + "step": 10466 + }, + { + "epoch": 1.1494618932571932, + "grad_norm": 2.3210999965667725, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7326582074165344, + "num_tokens": 264982894.0, + "step": 10467 + }, + { + "epoch": 1.1495717109598067, + "grad_norm": 2.3033485412597656, + "learning_rate": 1e-06, + "loss": 0.8327, + "mean_token_accuracy": 0.736128032207489, + "num_tokens": 265005894.0, + "step": 10468 + }, + { + "epoch": 1.1496815286624205, + "grad_norm": 
2.30517315864563, + "learning_rate": 1e-06, + "loss": 0.765, + "mean_token_accuracy": 0.7593916654586792, + "num_tokens": 265029451.0, + "step": 10469 + }, + { + "epoch": 1.149791346365034, + "grad_norm": 2.253586769104004, + "learning_rate": 1e-06, + "loss": 0.831, + "mean_token_accuracy": 0.7352315783500671, + "num_tokens": 265053176.0, + "step": 10470 + }, + { + "epoch": 1.1499011640676478, + "grad_norm": 2.314586639404297, + "learning_rate": 1e-06, + "loss": 0.884, + "mean_token_accuracy": 0.7209723591804504, + "num_tokens": 265077283.0, + "step": 10471 + }, + { + "epoch": 1.1500109817702613, + "grad_norm": 2.3432154655456543, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7262790203094482, + "num_tokens": 265100548.0, + "step": 10472 + }, + { + "epoch": 1.150120799472875, + "grad_norm": 2.39620304107666, + "learning_rate": 1e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.7383614778518677, + "num_tokens": 265122816.0, + "step": 10473 + }, + { + "epoch": 1.1502306171754886, + "grad_norm": 2.0582997798919678, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7166461944580078, + "num_tokens": 265153567.0, + "step": 10474 + }, + { + "epoch": 1.1503404348781023, + "grad_norm": 2.2922887802124023, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7259677648544312, + "num_tokens": 265179452.0, + "step": 10475 + }, + { + "epoch": 1.150450252580716, + "grad_norm": 2.1403753757476807, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7187163829803467, + "num_tokens": 265205946.0, + "step": 10476 + }, + { + "epoch": 1.1505600702833296, + "grad_norm": 2.1936869621276855, + "learning_rate": 1e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.687322735786438, + "num_tokens": 265233463.0, + "step": 10477 + }, + { + "epoch": 1.1506698879859434, + "grad_norm": 2.238011121749878, + "learning_rate": 1e-06, + "loss": 0.8426, + "mean_token_accuracy": 0.7348167896270752, + "num_tokens": 
265258349.0, + "step": 10478 + }, + { + "epoch": 1.150779705688557, + "grad_norm": 2.1951396465301514, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7141541242599487, + "num_tokens": 265285039.0, + "step": 10479 + }, + { + "epoch": 1.1508895233911707, + "grad_norm": 2.20752215385437, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7093991041183472, + "num_tokens": 265311482.0, + "step": 10480 + }, + { + "epoch": 1.1509993410937844, + "grad_norm": 2.1017873287200928, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7115848064422607, + "num_tokens": 265342056.0, + "step": 10481 + }, + { + "epoch": 1.151109158796398, + "grad_norm": 2.2755134105682373, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7192322611808777, + "num_tokens": 265365988.0, + "step": 10482 + }, + { + "epoch": 1.1512189764990117, + "grad_norm": 2.4075701236724854, + "learning_rate": 1e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.7351446151733398, + "num_tokens": 265388356.0, + "step": 10483 + }, + { + "epoch": 1.1513287942016253, + "grad_norm": 2.409588575363159, + "learning_rate": 1e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7243675589561462, + "num_tokens": 265410615.0, + "step": 10484 + }, + { + "epoch": 1.151438611904239, + "grad_norm": 2.1118061542510986, + "learning_rate": 1e-06, + "loss": 0.8398, + "mean_token_accuracy": 0.7435072660446167, + "num_tokens": 265436591.0, + "step": 10485 + }, + { + "epoch": 1.1515484296068526, + "grad_norm": 2.0090065002441406, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.7050300240516663, + "num_tokens": 265468911.0, + "step": 10486 + }, + { + "epoch": 1.1516582473094663, + "grad_norm": 2.6906721591949463, + "learning_rate": 1e-06, + "loss": 0.8472, + "mean_token_accuracy": 0.726058840751648, + "num_tokens": 265488231.0, + "step": 10487 + }, + { + "epoch": 1.1517680650120798, + "grad_norm": 2.353919267654419, + "learning_rate": 
1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.7007520198822021, + "num_tokens": 265513202.0, + "step": 10488 + }, + { + "epoch": 1.1518778827146936, + "grad_norm": 2.097388744354248, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7237345576286316, + "num_tokens": 265540189.0, + "step": 10489 + }, + { + "epoch": 1.1519877004173074, + "grad_norm": 2.274704933166504, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.6914727687835693, + "num_tokens": 265566817.0, + "step": 10490 + }, + { + "epoch": 1.152097518119921, + "grad_norm": 2.528487205505371, + "learning_rate": 1e-06, + "loss": 0.8618, + "mean_token_accuracy": 0.7369256019592285, + "num_tokens": 265586741.0, + "step": 10491 + }, + { + "epoch": 1.1522073358225347, + "grad_norm": 2.166337013244629, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7099473476409912, + "num_tokens": 265614457.0, + "step": 10492 + }, + { + "epoch": 1.1523171535251482, + "grad_norm": 2.397050142288208, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7160804271697998, + "num_tokens": 265637703.0, + "step": 10493 + }, + { + "epoch": 1.152426971227762, + "grad_norm": 2.266360282897949, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7005112171173096, + "num_tokens": 265662831.0, + "step": 10494 + }, + { + "epoch": 1.1525367889303757, + "grad_norm": 2.220254421234131, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7174782752990723, + "num_tokens": 265687677.0, + "step": 10495 + }, + { + "epoch": 1.1526466066329892, + "grad_norm": 2.145045518875122, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7083728313446045, + "num_tokens": 265718626.0, + "step": 10496 + }, + { + "epoch": 1.152756424335603, + "grad_norm": 2.2583391666412354, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7108616828918457, + "num_tokens": 265742410.0, + "step": 10497 + }, + { + 
"epoch": 1.1528662420382165, + "grad_norm": 2.2675609588623047, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7123894691467285, + "num_tokens": 265767156.0, + "step": 10498 + }, + { + "epoch": 1.1529760597408303, + "grad_norm": 2.4691808223724365, + "learning_rate": 1e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7229058742523193, + "num_tokens": 265789448.0, + "step": 10499 + }, + { + "epoch": 1.1530858774434438, + "grad_norm": 2.313663959503174, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7311522364616394, + "num_tokens": 265812720.0, + "step": 10500 + }, + { + "epoch": 1.1531956951460576, + "grad_norm": 2.37690806388855, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7062768936157227, + "num_tokens": 265836688.0, + "step": 10501 + }, + { + "epoch": 1.153305512848671, + "grad_norm": 2.1196014881134033, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7147406339645386, + "num_tokens": 265864394.0, + "step": 10502 + }, + { + "epoch": 1.1534153305512849, + "grad_norm": 2.1859617233276367, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7126109004020691, + "num_tokens": 265891027.0, + "step": 10503 + }, + { + "epoch": 1.1535251482538986, + "grad_norm": 2.4013257026672363, + "learning_rate": 1e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7173469066619873, + "num_tokens": 265917151.0, + "step": 10504 + }, + { + "epoch": 1.1536349659565122, + "grad_norm": 2.220918655395508, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7288129329681396, + "num_tokens": 265942568.0, + "step": 10505 + }, + { + "epoch": 1.153744783659126, + "grad_norm": 2.3441901206970215, + "learning_rate": 1e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7425878643989563, + "num_tokens": 265964416.0, + "step": 10506 + }, + { + "epoch": 1.1538546013617395, + "grad_norm": 2.3681647777557373, + "learning_rate": 1e-06, + "loss": 0.9086, + 
"mean_token_accuracy": 0.7114163637161255, + "num_tokens": 265987219.0, + "step": 10507 + }, + { + "epoch": 1.1539644190643532, + "grad_norm": 2.2643871307373047, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7193025350570679, + "num_tokens": 266011564.0, + "step": 10508 + }, + { + "epoch": 1.1540742367669667, + "grad_norm": 2.1184587478637695, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7190589308738708, + "num_tokens": 266038082.0, + "step": 10509 + }, + { + "epoch": 1.1541840544695805, + "grad_norm": 2.5652921199798584, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7206023931503296, + "num_tokens": 266058397.0, + "step": 10510 + }, + { + "epoch": 1.154293872172194, + "grad_norm": 2.1208441257476807, + "learning_rate": 1e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.732166588306427, + "num_tokens": 266086338.0, + "step": 10511 + }, + { + "epoch": 1.1544036898748078, + "grad_norm": 2.307973861694336, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7129009366035461, + "num_tokens": 266110935.0, + "step": 10512 + }, + { + "epoch": 1.1545135075774215, + "grad_norm": 2.012296676635742, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.691116452217102, + "num_tokens": 266142936.0, + "step": 10513 + }, + { + "epoch": 1.154623325280035, + "grad_norm": 2.5543458461761475, + "learning_rate": 1e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7341733574867249, + "num_tokens": 266164273.0, + "step": 10514 + }, + { + "epoch": 1.1547331429826488, + "grad_norm": 2.2558939456939697, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7090431451797485, + "num_tokens": 266189613.0, + "step": 10515 + }, + { + "epoch": 1.1548429606852624, + "grad_norm": 2.0360634326934814, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7112589478492737, + "num_tokens": 266220471.0, + "step": 10516 + }, + { + "epoch": 
1.1549527783878761, + "grad_norm": 2.185157060623169, + "learning_rate": 1e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.7266904711723328, + "num_tokens": 266247244.0, + "step": 10517 + }, + { + "epoch": 1.15506259609049, + "grad_norm": 2.3433170318603516, + "learning_rate": 1e-06, + "loss": 0.8436, + "mean_token_accuracy": 0.7383244037628174, + "num_tokens": 266269223.0, + "step": 10518 + }, + { + "epoch": 1.1551724137931034, + "grad_norm": 2.1231939792633057, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.700951337814331, + "num_tokens": 266301457.0, + "step": 10519 + }, + { + "epoch": 1.1552822314957172, + "grad_norm": 2.261918544769287, + "learning_rate": 1e-06, + "loss": 0.7825, + "mean_token_accuracy": 0.752332329750061, + "num_tokens": 266325740.0, + "step": 10520 + }, + { + "epoch": 1.1553920491983307, + "grad_norm": 2.6516621112823486, + "learning_rate": 1e-06, + "loss": 0.8498, + "mean_token_accuracy": 0.7346388697624207, + "num_tokens": 266344029.0, + "step": 10521 + }, + { + "epoch": 1.1555018669009445, + "grad_norm": 2.147543430328369, + "learning_rate": 1e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.7343100905418396, + "num_tokens": 266370778.0, + "step": 10522 + }, + { + "epoch": 1.155611684603558, + "grad_norm": 2.0696303844451904, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7126563191413879, + "num_tokens": 266400691.0, + "step": 10523 + }, + { + "epoch": 1.1557215023061718, + "grad_norm": 2.2526698112487793, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7118194103240967, + "num_tokens": 266425953.0, + "step": 10524 + }, + { + "epoch": 1.1558313200087853, + "grad_norm": 2.3145923614501953, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7178378105163574, + "num_tokens": 266450139.0, + "step": 10525 + }, + { + "epoch": 1.155941137711399, + "grad_norm": 2.085711717605591, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 
0.715323805809021, + "num_tokens": 266479757.0, + "step": 10526 + }, + { + "epoch": 1.1560509554140128, + "grad_norm": 2.377319097518921, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.6975457668304443, + "num_tokens": 266502581.0, + "step": 10527 + }, + { + "epoch": 1.1561607731166264, + "grad_norm": 2.760066509246826, + "learning_rate": 1e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7339165806770325, + "num_tokens": 266521785.0, + "step": 10528 + }, + { + "epoch": 1.15627059081924, + "grad_norm": 2.3213531970977783, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7047901153564453, + "num_tokens": 266545601.0, + "step": 10529 + }, + { + "epoch": 1.1563804085218536, + "grad_norm": 2.380112409591675, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7261753082275391, + "num_tokens": 266568353.0, + "step": 10530 + }, + { + "epoch": 1.1564902262244674, + "grad_norm": 2.3970589637756348, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7195559740066528, + "num_tokens": 266591459.0, + "step": 10531 + }, + { + "epoch": 1.1566000439270812, + "grad_norm": 2.328217029571533, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7027482986450195, + "num_tokens": 266616051.0, + "step": 10532 + }, + { + "epoch": 1.1567098616296947, + "grad_norm": 2.317370891571045, + "learning_rate": 1e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.7383894920349121, + "num_tokens": 266639588.0, + "step": 10533 + }, + { + "epoch": 1.1568196793323084, + "grad_norm": 2.106398820877075, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7055578231811523, + "num_tokens": 266668086.0, + "step": 10534 + }, + { + "epoch": 1.156929497034922, + "grad_norm": 2.261976718902588, + "learning_rate": 1e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.7391738891601562, + "num_tokens": 266691489.0, + "step": 10535 + }, + { + "epoch": 1.1570393147375357, + "grad_norm": 
2.224191665649414, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.7036923170089722, + "num_tokens": 266719752.0, + "step": 10536 + }, + { + "epoch": 1.1571491324401493, + "grad_norm": 2.301020383834839, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7164166569709778, + "num_tokens": 266744975.0, + "step": 10537 + }, + { + "epoch": 1.157258950142763, + "grad_norm": 2.364980697631836, + "learning_rate": 1e-06, + "loss": 0.7456, + "mean_token_accuracy": 0.7533678412437439, + "num_tokens": 266766702.0, + "step": 10538 + }, + { + "epoch": 1.1573687678453766, + "grad_norm": 2.4154129028320312, + "learning_rate": 1e-06, + "loss": 0.8578, + "mean_token_accuracy": 0.7280101776123047, + "num_tokens": 266790309.0, + "step": 10539 + }, + { + "epoch": 1.1574785855479903, + "grad_norm": 2.5102713108062744, + "learning_rate": 1e-06, + "loss": 0.8045, + "mean_token_accuracy": 0.7394888997077942, + "num_tokens": 266811851.0, + "step": 10540 + }, + { + "epoch": 1.157588403250604, + "grad_norm": 2.4564216136932373, + "learning_rate": 1e-06, + "loss": 0.7972, + "mean_token_accuracy": 0.7403202056884766, + "num_tokens": 266832566.0, + "step": 10541 + }, + { + "epoch": 1.1576982209532176, + "grad_norm": 2.3788065910339355, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7169069051742554, + "num_tokens": 266855652.0, + "step": 10542 + }, + { + "epoch": 1.1578080386558314, + "grad_norm": 2.5743279457092285, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7279332876205444, + "num_tokens": 266877407.0, + "step": 10543 + }, + { + "epoch": 1.157917856358445, + "grad_norm": 2.103602409362793, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7092283368110657, + "num_tokens": 266907581.0, + "step": 10544 + }, + { + "epoch": 1.1580276740610587, + "grad_norm": 2.211940050125122, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7042065262794495, + "num_tokens": 
266934147.0, + "step": 10545 + }, + { + "epoch": 1.1581374917636724, + "grad_norm": 2.1090121269226074, + "learning_rate": 1e-06, + "loss": 0.8063, + "mean_token_accuracy": 0.7459927201271057, + "num_tokens": 266960767.0, + "step": 10546 + }, + { + "epoch": 1.158247309466286, + "grad_norm": 2.1459977626800537, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7209784984588623, + "num_tokens": 266988867.0, + "step": 10547 + }, + { + "epoch": 1.1583571271688997, + "grad_norm": 2.3449952602386475, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7157341241836548, + "num_tokens": 267012613.0, + "step": 10548 + }, + { + "epoch": 1.1584669448715132, + "grad_norm": 2.195814371109009, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.72176194190979, + "num_tokens": 267039502.0, + "step": 10549 + }, + { + "epoch": 1.158576762574127, + "grad_norm": 2.2656922340393066, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.718769371509552, + "num_tokens": 267065369.0, + "step": 10550 + }, + { + "epoch": 1.1586865802767405, + "grad_norm": 2.353714942932129, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7272510528564453, + "num_tokens": 267090196.0, + "step": 10551 + }, + { + "epoch": 1.1587963979793543, + "grad_norm": 2.4292852878570557, + "learning_rate": 1e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.742922306060791, + "num_tokens": 267113362.0, + "step": 10552 + }, + { + "epoch": 1.1589062156819678, + "grad_norm": 2.169358968734741, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7146874070167542, + "num_tokens": 267140677.0, + "step": 10553 + }, + { + "epoch": 1.1590160333845816, + "grad_norm": 2.301870822906494, + "learning_rate": 1e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7221401929855347, + "num_tokens": 267163602.0, + "step": 10554 + }, + { + "epoch": 1.1591258510871953, + "grad_norm": 2.524785280227661, + "learning_rate": 1e-06, 
+ "loss": 0.8059, + "mean_token_accuracy": 0.7404881119728088, + "num_tokens": 267183529.0, + "step": 10555 + }, + { + "epoch": 1.1592356687898089, + "grad_norm": 2.108851671218872, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7293797731399536, + "num_tokens": 267211539.0, + "step": 10556 + }, + { + "epoch": 1.1593454864924226, + "grad_norm": 2.305687189102173, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7217193841934204, + "num_tokens": 267235657.0, + "step": 10557 + }, + { + "epoch": 1.1594553041950362, + "grad_norm": 2.2795913219451904, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.704321026802063, + "num_tokens": 267259990.0, + "step": 10558 + }, + { + "epoch": 1.15956512189765, + "grad_norm": 2.3819684982299805, + "learning_rate": 1e-06, + "loss": 0.8206, + "mean_token_accuracy": 0.7452273368835449, + "num_tokens": 267282920.0, + "step": 10559 + }, + { + "epoch": 1.1596749396002635, + "grad_norm": 2.465022325515747, + "learning_rate": 1e-06, + "loss": 0.8674, + "mean_token_accuracy": 0.7360657453536987, + "num_tokens": 267306050.0, + "step": 10560 + }, + { + "epoch": 1.1597847573028772, + "grad_norm": 2.434826612472534, + "learning_rate": 1e-06, + "loss": 0.8252, + "mean_token_accuracy": 0.7472052574157715, + "num_tokens": 267329271.0, + "step": 10561 + }, + { + "epoch": 1.159894575005491, + "grad_norm": 2.405219793319702, + "learning_rate": 1e-06, + "loss": 0.8032, + "mean_token_accuracy": 0.7430611848831177, + "num_tokens": 267352464.0, + "step": 10562 + }, + { + "epoch": 1.1600043927081045, + "grad_norm": 2.3623926639556885, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7223660945892334, + "num_tokens": 267375915.0, + "step": 10563 + }, + { + "epoch": 1.1601142104107183, + "grad_norm": 2.2427239418029785, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7057902812957764, + "num_tokens": 267401680.0, + "step": 10564 + }, + { + "epoch": 
1.1602240281133318, + "grad_norm": 2.2177605628967285, + "learning_rate": 1e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.7203625440597534, + "num_tokens": 267428909.0, + "step": 10565 + }, + { + "epoch": 1.1603338458159456, + "grad_norm": 2.19384503364563, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.6935257911682129, + "num_tokens": 267456824.0, + "step": 10566 + }, + { + "epoch": 1.160443663518559, + "grad_norm": 2.1995437145233154, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.721755862236023, + "num_tokens": 267485696.0, + "step": 10567 + }, + { + "epoch": 1.1605534812211729, + "grad_norm": 2.6366848945617676, + "learning_rate": 1e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7213062644004822, + "num_tokens": 267506108.0, + "step": 10568 + }, + { + "epoch": 1.1606632989237866, + "grad_norm": 2.1888492107391357, + "learning_rate": 1e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.719376802444458, + "num_tokens": 267533118.0, + "step": 10569 + }, + { + "epoch": 1.1607731166264001, + "grad_norm": 2.601390838623047, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7175142765045166, + "num_tokens": 267551798.0, + "step": 10570 + }, + { + "epoch": 1.160882934329014, + "grad_norm": 2.1164562702178955, + "learning_rate": 1e-06, + "loss": 0.8046, + "mean_token_accuracy": 0.747029721736908, + "num_tokens": 267578718.0, + "step": 10571 + }, + { + "epoch": 1.1609927520316274, + "grad_norm": 2.4027559757232666, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.6967252492904663, + "num_tokens": 267603426.0, + "step": 10572 + }, + { + "epoch": 1.1611025697342412, + "grad_norm": 2.1784582138061523, + "learning_rate": 1e-06, + "loss": 0.888, + "mean_token_accuracy": 0.7207316756248474, + "num_tokens": 267629710.0, + "step": 10573 + }, + { + "epoch": 1.1612123874368547, + "grad_norm": 2.31801700592041, + "learning_rate": 1e-06, + "loss": 0.763, + "mean_token_accuracy": 
0.7496914267539978, + "num_tokens": 267652892.0, + "step": 10574 + }, + { + "epoch": 1.1613222051394685, + "grad_norm": 2.1524460315704346, + "learning_rate": 1e-06, + "loss": 0.8832, + "mean_token_accuracy": 0.7305105924606323, + "num_tokens": 267679947.0, + "step": 10575 + }, + { + "epoch": 1.161432022842082, + "grad_norm": 1.9531018733978271, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7113828063011169, + "num_tokens": 267713642.0, + "step": 10576 + }, + { + "epoch": 1.1615418405446958, + "grad_norm": 2.2867140769958496, + "learning_rate": 1e-06, + "loss": 0.8295, + "mean_token_accuracy": 0.741938054561615, + "num_tokens": 267736632.0, + "step": 10577 + }, + { + "epoch": 1.1616516582473095, + "grad_norm": 2.424254894256592, + "learning_rate": 1e-06, + "loss": 0.8079, + "mean_token_accuracy": 0.7558208703994751, + "num_tokens": 267758713.0, + "step": 10578 + }, + { + "epoch": 1.161761475949923, + "grad_norm": 2.0694046020507812, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7052909135818481, + "num_tokens": 267789128.0, + "step": 10579 + }, + { + "epoch": 1.1618712936525368, + "grad_norm": 2.278367757797241, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7135776281356812, + "num_tokens": 267816426.0, + "step": 10580 + }, + { + "epoch": 1.1619811113551504, + "grad_norm": 2.0427777767181396, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7081841230392456, + "num_tokens": 267847289.0, + "step": 10581 + }, + { + "epoch": 1.1620909290577641, + "grad_norm": 2.147087574005127, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.722851037979126, + "num_tokens": 267872450.0, + "step": 10582 + }, + { + "epoch": 1.1622007467603779, + "grad_norm": 2.7788124084472656, + "learning_rate": 1e-06, + "loss": 0.7344, + "mean_token_accuracy": 0.7564432621002197, + "num_tokens": 267888973.0, + "step": 10583 + }, + { + "epoch": 1.1623105644629914, + "grad_norm": 
2.5011303424835205, + "learning_rate": 1e-06, + "loss": 0.8563, + "mean_token_accuracy": 0.7261008024215698, + "num_tokens": 267911007.0, + "step": 10584 + }, + { + "epoch": 1.1624203821656052, + "grad_norm": 2.488978624343872, + "learning_rate": 1e-06, + "loss": 0.843, + "mean_token_accuracy": 0.7395755648612976, + "num_tokens": 267932367.0, + "step": 10585 + }, + { + "epoch": 1.1625301998682187, + "grad_norm": 2.303495407104492, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6908699870109558, + "num_tokens": 267960128.0, + "step": 10586 + }, + { + "epoch": 1.1626400175708325, + "grad_norm": 2.2883944511413574, + "learning_rate": 1e-06, + "loss": 0.8124, + "mean_token_accuracy": 0.7420940399169922, + "num_tokens": 267982492.0, + "step": 10587 + }, + { + "epoch": 1.162749835273446, + "grad_norm": 2.3049468994140625, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7111951112747192, + "num_tokens": 268009574.0, + "step": 10588 + }, + { + "epoch": 1.1628596529760598, + "grad_norm": 2.1946609020233154, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7363325357437134, + "num_tokens": 268037638.0, + "step": 10589 + }, + { + "epoch": 1.1629694706786733, + "grad_norm": 2.319828748703003, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7010045051574707, + "num_tokens": 268062304.0, + "step": 10590 + }, + { + "epoch": 1.163079288381287, + "grad_norm": 2.2702198028564453, + "learning_rate": 1e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7092888355255127, + "num_tokens": 268086861.0, + "step": 10591 + }, + { + "epoch": 1.1631891060839008, + "grad_norm": 2.2799265384674072, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.721494197845459, + "num_tokens": 268111444.0, + "step": 10592 + }, + { + "epoch": 1.1632989237865143, + "grad_norm": 2.143458604812622, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7208124995231628, + "num_tokens": 
268138485.0, + "step": 10593 + }, + { + "epoch": 1.163408741489128, + "grad_norm": 2.2940220832824707, + "learning_rate": 1e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.7392985820770264, + "num_tokens": 268161206.0, + "step": 10594 + }, + { + "epoch": 1.1635185591917416, + "grad_norm": 2.635068416595459, + "learning_rate": 1e-06, + "loss": 0.8178, + "mean_token_accuracy": 0.7514818906784058, + "num_tokens": 268180980.0, + "step": 10595 + }, + { + "epoch": 1.1636283768943554, + "grad_norm": 2.2568843364715576, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.705120325088501, + "num_tokens": 268205497.0, + "step": 10596 + }, + { + "epoch": 1.1637381945969691, + "grad_norm": 2.608865261077881, + "learning_rate": 1e-06, + "loss": 0.7888, + "mean_token_accuracy": 0.7457261681556702, + "num_tokens": 268224782.0, + "step": 10597 + }, + { + "epoch": 1.1638480122995827, + "grad_norm": 2.4150304794311523, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7126764059066772, + "num_tokens": 268246222.0, + "step": 10598 + }, + { + "epoch": 1.1639578300021964, + "grad_norm": 2.37870454788208, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7185171246528625, + "num_tokens": 268269991.0, + "step": 10599 + }, + { + "epoch": 1.16406764770481, + "grad_norm": 2.501817226409912, + "learning_rate": 1e-06, + "loss": 0.8516, + "mean_token_accuracy": 0.7345200777053833, + "num_tokens": 268291955.0, + "step": 10600 + }, + { + "epoch": 1.1641774654074237, + "grad_norm": 2.538818359375, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7074040174484253, + "num_tokens": 268313200.0, + "step": 10601 + }, + { + "epoch": 1.1642872831100373, + "grad_norm": 2.2193915843963623, + "learning_rate": 1e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7441064119338989, + "num_tokens": 268338874.0, + "step": 10602 + }, + { + "epoch": 1.164397100812651, + "grad_norm": 2.111525297164917, + "learning_rate": 1e-06, + 
"loss": 0.9347, + "mean_token_accuracy": 0.7059309482574463, + "num_tokens": 268368153.0, + "step": 10603 + }, + { + "epoch": 1.1645069185152646, + "grad_norm": 2.1237449645996094, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7206407785415649, + "num_tokens": 268396339.0, + "step": 10604 + }, + { + "epoch": 1.1646167362178783, + "grad_norm": 2.1722118854522705, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7257930040359497, + "num_tokens": 268421927.0, + "step": 10605 + }, + { + "epoch": 1.164726553920492, + "grad_norm": 1.9905349016189575, + "learning_rate": 1e-06, + "loss": 0.7933, + "mean_token_accuracy": 0.7509884238243103, + "num_tokens": 268448702.0, + "step": 10606 + }, + { + "epoch": 1.1648363716231056, + "grad_norm": 2.605663537979126, + "learning_rate": 1e-06, + "loss": 0.8131, + "mean_token_accuracy": 0.7400908470153809, + "num_tokens": 268469264.0, + "step": 10607 + }, + { + "epoch": 1.1649461893257194, + "grad_norm": 2.1628646850585938, + "learning_rate": 1e-06, + "loss": 1.0314, + "mean_token_accuracy": 0.6881118416786194, + "num_tokens": 268497190.0, + "step": 10608 + }, + { + "epoch": 1.165056007028333, + "grad_norm": 2.246264934539795, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7048061490058899, + "num_tokens": 268522197.0, + "step": 10609 + }, + { + "epoch": 1.1651658247309467, + "grad_norm": 2.3749160766601562, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7267298698425293, + "num_tokens": 268544152.0, + "step": 10610 + }, + { + "epoch": 1.1652756424335604, + "grad_norm": 2.7095470428466797, + "learning_rate": 1e-06, + "loss": 0.7887, + "mean_token_accuracy": 0.7482912540435791, + "num_tokens": 268562762.0, + "step": 10611 + }, + { + "epoch": 1.165385460136174, + "grad_norm": 2.3501651287078857, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.718865692615509, + "num_tokens": 268587587.0, + "step": 10612 + }, + { + 
"epoch": 1.1654952778387877, + "grad_norm": 2.599884510040283, + "learning_rate": 1e-06, + "loss": 0.8188, + "mean_token_accuracy": 0.7393825054168701, + "num_tokens": 268607301.0, + "step": 10613 + }, + { + "epoch": 1.1656050955414012, + "grad_norm": 2.173288345336914, + "learning_rate": 1e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.7352945804595947, + "num_tokens": 268633652.0, + "step": 10614 + }, + { + "epoch": 1.165714913244015, + "grad_norm": 2.377415895462036, + "learning_rate": 1e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7266170978546143, + "num_tokens": 268656495.0, + "step": 10615 + }, + { + "epoch": 1.1658247309466285, + "grad_norm": 2.3235580921173096, + "learning_rate": 1e-06, + "loss": 0.8106, + "mean_token_accuracy": 0.7392860651016235, + "num_tokens": 268679964.0, + "step": 10616 + }, + { + "epoch": 1.1659345486492423, + "grad_norm": 2.431978464126587, + "learning_rate": 1e-06, + "loss": 0.8405, + "mean_token_accuracy": 0.7344928979873657, + "num_tokens": 268700904.0, + "step": 10617 + }, + { + "epoch": 1.1660443663518558, + "grad_norm": 2.51617431640625, + "learning_rate": 1e-06, + "loss": 0.858, + "mean_token_accuracy": 0.7334520816802979, + "num_tokens": 268721883.0, + "step": 10618 + }, + { + "epoch": 1.1661541840544696, + "grad_norm": 2.211775064468384, + "learning_rate": 1e-06, + "loss": 0.8093, + "mean_token_accuracy": 0.7430962920188904, + "num_tokens": 268749129.0, + "step": 10619 + }, + { + "epoch": 1.1662640017570833, + "grad_norm": 2.618100881576538, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7110731601715088, + "num_tokens": 268772315.0, + "step": 10620 + }, + { + "epoch": 1.1663738194596969, + "grad_norm": 2.261500120162964, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7201542258262634, + "num_tokens": 268797952.0, + "step": 10621 + }, + { + "epoch": 1.1664836371623106, + "grad_norm": 2.060093879699707, + "learning_rate": 1e-06, + "loss": 0.9201, + 
"mean_token_accuracy": 0.7300217151641846, + "num_tokens": 268826308.0, + "step": 10622 + }, + { + "epoch": 1.1665934548649242, + "grad_norm": 2.2920069694519043, + "learning_rate": 1e-06, + "loss": 0.8557, + "mean_token_accuracy": 0.7359198331832886, + "num_tokens": 268849756.0, + "step": 10623 + }, + { + "epoch": 1.166703272567538, + "grad_norm": 2.196375608444214, + "learning_rate": 1e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.7193232774734497, + "num_tokens": 268875102.0, + "step": 10624 + }, + { + "epoch": 1.1668130902701515, + "grad_norm": 2.3437156677246094, + "learning_rate": 1e-06, + "loss": 0.844, + "mean_token_accuracy": 0.731198787689209, + "num_tokens": 268898657.0, + "step": 10625 + }, + { + "epoch": 1.1669229079727652, + "grad_norm": 2.2955570220947266, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.6967267990112305, + "num_tokens": 268924605.0, + "step": 10626 + }, + { + "epoch": 1.167032725675379, + "grad_norm": 2.1941535472869873, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.709244966506958, + "num_tokens": 268952892.0, + "step": 10627 + }, + { + "epoch": 1.1671425433779925, + "grad_norm": 2.083054542541504, + "learning_rate": 1e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.724754273891449, + "num_tokens": 268981388.0, + "step": 10628 + }, + { + "epoch": 1.1672523610806063, + "grad_norm": 2.0980801582336426, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7139512300491333, + "num_tokens": 269011257.0, + "step": 10629 + }, + { + "epoch": 1.1673621787832198, + "grad_norm": 2.6841230392456055, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7192178964614868, + "num_tokens": 269031264.0, + "step": 10630 + }, + { + "epoch": 1.1674719964858336, + "grad_norm": 2.2320942878723145, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7068535089492798, + "num_tokens": 269057981.0, + "step": 10631 + }, + { + "epoch": 1.167581814188447, 
+ "grad_norm": 2.1943442821502686, + "learning_rate": 1e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7262952327728271, + "num_tokens": 269084261.0, + "step": 10632 + }, + { + "epoch": 1.1676916318910608, + "grad_norm": 2.2040953636169434, + "learning_rate": 1e-06, + "loss": 0.8434, + "mean_token_accuracy": 0.7326529026031494, + "num_tokens": 269110432.0, + "step": 10633 + }, + { + "epoch": 1.1678014495936746, + "grad_norm": 2.5229978561401367, + "learning_rate": 1e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7436660528182983, + "num_tokens": 269130300.0, + "step": 10634 + }, + { + "epoch": 1.1679112672962881, + "grad_norm": 2.414318084716797, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7238585352897644, + "num_tokens": 269152168.0, + "step": 10635 + }, + { + "epoch": 1.168021084998902, + "grad_norm": 2.264145612716675, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7028449773788452, + "num_tokens": 269179400.0, + "step": 10636 + }, + { + "epoch": 1.1681309027015154, + "grad_norm": 2.1361165046691895, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7236683368682861, + "num_tokens": 269206749.0, + "step": 10637 + }, + { + "epoch": 1.1682407204041292, + "grad_norm": 2.0701496601104736, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7139896154403687, + "num_tokens": 269234632.0, + "step": 10638 + }, + { + "epoch": 1.1683505381067427, + "grad_norm": 2.616856098175049, + "learning_rate": 1e-06, + "loss": 0.8006, + "mean_token_accuracy": 0.7549681663513184, + "num_tokens": 269253812.0, + "step": 10639 + }, + { + "epoch": 1.1684603558093565, + "grad_norm": 2.3333847522735596, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7027990221977234, + "num_tokens": 269278666.0, + "step": 10640 + }, + { + "epoch": 1.16857017351197, + "grad_norm": 2.2897324562072754, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7108186483383179, 
+ "num_tokens": 269305166.0, + "step": 10641 + }, + { + "epoch": 1.1686799912145838, + "grad_norm": 2.668987989425659, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7169473767280579, + "num_tokens": 269324142.0, + "step": 10642 + }, + { + "epoch": 1.1687898089171975, + "grad_norm": 2.0716307163238525, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.708857536315918, + "num_tokens": 269355001.0, + "step": 10643 + }, + { + "epoch": 1.168899626619811, + "grad_norm": 2.2030892372131348, + "learning_rate": 1e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.7255102396011353, + "num_tokens": 269379078.0, + "step": 10644 + }, + { + "epoch": 1.1690094443224248, + "grad_norm": 2.213628053665161, + "learning_rate": 1e-06, + "loss": 1.0597, + "mean_token_accuracy": 0.6839689016342163, + "num_tokens": 269409289.0, + "step": 10645 + }, + { + "epoch": 1.1691192620250384, + "grad_norm": 2.4812920093536377, + "learning_rate": 1e-06, + "loss": 0.8832, + "mean_token_accuracy": 0.7229627370834351, + "num_tokens": 269430338.0, + "step": 10646 + }, + { + "epoch": 1.169229079727652, + "grad_norm": 2.0579967498779297, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7065513134002686, + "num_tokens": 269460463.0, + "step": 10647 + }, + { + "epoch": 1.1693388974302659, + "grad_norm": 2.2496895790100098, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7233575582504272, + "num_tokens": 269485772.0, + "step": 10648 + }, + { + "epoch": 1.1694487151328794, + "grad_norm": 2.407493829727173, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7142701745033264, + "num_tokens": 269509500.0, + "step": 10649 + }, + { + "epoch": 1.1695585328354932, + "grad_norm": 2.6262643337249756, + "learning_rate": 1e-06, + "loss": 0.7614, + "mean_token_accuracy": 0.7575547099113464, + "num_tokens": 269528884.0, + "step": 10650 + }, + { + "epoch": 1.1696683505381067, + "grad_norm": 2.075894594192505, + 
"learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7133080959320068, + "num_tokens": 269559251.0, + "step": 10651 + }, + { + "epoch": 1.1697781682407205, + "grad_norm": 2.0531258583068848, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7289808988571167, + "num_tokens": 269589665.0, + "step": 10652 + }, + { + "epoch": 1.169887985943334, + "grad_norm": 1.9024008512496948, + "learning_rate": 1e-06, + "loss": 0.8671, + "mean_token_accuracy": 0.7312154769897461, + "num_tokens": 269623563.0, + "step": 10653 + }, + { + "epoch": 1.1699978036459477, + "grad_norm": 2.280935049057007, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.71819007396698, + "num_tokens": 269650573.0, + "step": 10654 + }, + { + "epoch": 1.1701076213485613, + "grad_norm": 2.398639678955078, + "learning_rate": 1e-06, + "loss": 0.8389, + "mean_token_accuracy": 0.7450946569442749, + "num_tokens": 269673150.0, + "step": 10655 + }, + { + "epoch": 1.170217439051175, + "grad_norm": 2.182574987411499, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7155010104179382, + "num_tokens": 269700442.0, + "step": 10656 + }, + { + "epoch": 1.1703272567537888, + "grad_norm": 2.2643022537231445, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7265787124633789, + "num_tokens": 269724082.0, + "step": 10657 + }, + { + "epoch": 1.1704370744564023, + "grad_norm": 2.347869634628296, + "learning_rate": 1e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7380636930465698, + "num_tokens": 269746393.0, + "step": 10658 + }, + { + "epoch": 1.170546892159016, + "grad_norm": 2.706023931503296, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7056113481521606, + "num_tokens": 269768171.0, + "step": 10659 + }, + { + "epoch": 1.1706567098616296, + "grad_norm": 2.499116897583008, + "learning_rate": 1e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7325382232666016, + "num_tokens": 269788390.0, + "step": 10660 
+ }, + { + "epoch": 1.1707665275642434, + "grad_norm": 2.2847888469696045, + "learning_rate": 1e-06, + "loss": 0.866, + "mean_token_accuracy": 0.7295522689819336, + "num_tokens": 269812250.0, + "step": 10661 + }, + { + "epoch": 1.1708763452668571, + "grad_norm": 2.2041051387786865, + "learning_rate": 1e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.721822202205658, + "num_tokens": 269837117.0, + "step": 10662 + }, + { + "epoch": 1.1709861629694707, + "grad_norm": 2.211324453353882, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7118399143218994, + "num_tokens": 269862347.0, + "step": 10663 + }, + { + "epoch": 1.1710959806720844, + "grad_norm": 2.868237018585205, + "learning_rate": 1e-06, + "loss": 0.8507, + "mean_token_accuracy": 0.7339814305305481, + "num_tokens": 269880708.0, + "step": 10664 + }, + { + "epoch": 1.171205798374698, + "grad_norm": 2.4275662899017334, + "learning_rate": 1e-06, + "loss": 0.8483, + "mean_token_accuracy": 0.7285891771316528, + "num_tokens": 269901882.0, + "step": 10665 + }, + { + "epoch": 1.1713156160773117, + "grad_norm": 2.1328649520874023, + "learning_rate": 1e-06, + "loss": 1.041, + "mean_token_accuracy": 0.6860171556472778, + "num_tokens": 269930716.0, + "step": 10666 + }, + { + "epoch": 1.1714254337799253, + "grad_norm": 2.1878859996795654, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7147143483161926, + "num_tokens": 269960041.0, + "step": 10667 + }, + { + "epoch": 1.171535251482539, + "grad_norm": 2.3356096744537354, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7134034037590027, + "num_tokens": 269985148.0, + "step": 10668 + }, + { + "epoch": 1.1716450691851525, + "grad_norm": 2.21781587600708, + "learning_rate": 1e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.7321696877479553, + "num_tokens": 270011611.0, + "step": 10669 + }, + { + "epoch": 1.1717548868877663, + "grad_norm": 2.2262327671051025, + "learning_rate": 1e-06, + "loss": 0.8309, + 
"mean_token_accuracy": 0.7347628474235535, + "num_tokens": 270035521.0, + "step": 10670 + }, + { + "epoch": 1.17186470459038, + "grad_norm": 2.505218029022217, + "learning_rate": 1e-06, + "loss": 0.823, + "mean_token_accuracy": 0.7424857020378113, + "num_tokens": 270056092.0, + "step": 10671 + }, + { + "epoch": 1.1719745222929936, + "grad_norm": 2.486388921737671, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7184274196624756, + "num_tokens": 270079129.0, + "step": 10672 + }, + { + "epoch": 1.1720843399956073, + "grad_norm": 2.114352226257324, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7075924873352051, + "num_tokens": 270107281.0, + "step": 10673 + }, + { + "epoch": 1.1721941576982209, + "grad_norm": 1.959537148475647, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.727543830871582, + "num_tokens": 270137817.0, + "step": 10674 + }, + { + "epoch": 1.1723039754008346, + "grad_norm": 2.281985282897949, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7097539901733398, + "num_tokens": 270162184.0, + "step": 10675 + }, + { + "epoch": 1.1724137931034484, + "grad_norm": 2.456758737564087, + "learning_rate": 1e-06, + "loss": 0.7451, + "mean_token_accuracy": 0.7699657678604126, + "num_tokens": 270182101.0, + "step": 10676 + }, + { + "epoch": 1.172523610806062, + "grad_norm": 2.068446397781372, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7136554718017578, + "num_tokens": 270213655.0, + "step": 10677 + }, + { + "epoch": 1.1726334285086757, + "grad_norm": 2.438722848892212, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.710875391960144, + "num_tokens": 270237228.0, + "step": 10678 + }, + { + "epoch": 1.1727432462112892, + "grad_norm": 2.3922245502471924, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7197924852371216, + "num_tokens": 270261092.0, + "step": 10679 + }, + { + "epoch": 1.172853063913903, + 
"grad_norm": 2.3135275840759277, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7065746784210205, + "num_tokens": 270287365.0, + "step": 10680 + }, + { + "epoch": 1.1729628816165165, + "grad_norm": 2.420334815979004, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7330278754234314, + "num_tokens": 270312214.0, + "step": 10681 + }, + { + "epoch": 1.1730726993191303, + "grad_norm": 2.594602346420288, + "learning_rate": 1e-06, + "loss": 0.7791, + "mean_token_accuracy": 0.7472623586654663, + "num_tokens": 270330831.0, + "step": 10682 + }, + { + "epoch": 1.1731825170217438, + "grad_norm": 2.1421353816986084, + "learning_rate": 1e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7188917398452759, + "num_tokens": 270357376.0, + "step": 10683 + }, + { + "epoch": 1.1732923347243576, + "grad_norm": 2.2784619331359863, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7104502320289612, + "num_tokens": 270382775.0, + "step": 10684 + }, + { + "epoch": 1.1734021524269713, + "grad_norm": 2.297036647796631, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7218159437179565, + "num_tokens": 270409412.0, + "step": 10685 + }, + { + "epoch": 1.1735119701295849, + "grad_norm": 2.383975028991699, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7047955989837646, + "num_tokens": 270433596.0, + "step": 10686 + }, + { + "epoch": 1.1736217878321986, + "grad_norm": 2.30436635017395, + "learning_rate": 1e-06, + "loss": 0.8727, + "mean_token_accuracy": 0.7291260957717896, + "num_tokens": 270455829.0, + "step": 10687 + }, + { + "epoch": 1.1737316055348122, + "grad_norm": 2.1872313022613525, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.6946291923522949, + "num_tokens": 270482631.0, + "step": 10688 + }, + { + "epoch": 1.173841423237426, + "grad_norm": 2.1162590980529785, + "learning_rate": 1e-06, + "loss": 0.7774, + "mean_token_accuracy": 0.7520900964736938, + 
"num_tokens": 270507214.0, + "step": 10689 + }, + { + "epoch": 1.1739512409400394, + "grad_norm": 2.3532443046569824, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.731059193611145, + "num_tokens": 270530363.0, + "step": 10690 + }, + { + "epoch": 1.1740610586426532, + "grad_norm": 2.0680489540100098, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.705693244934082, + "num_tokens": 270558205.0, + "step": 10691 + }, + { + "epoch": 1.1741708763452667, + "grad_norm": 2.342588424682617, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7360415458679199, + "num_tokens": 270582267.0, + "step": 10692 + }, + { + "epoch": 1.1742806940478805, + "grad_norm": 2.5326058864593506, + "learning_rate": 1e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7455179691314697, + "num_tokens": 270603695.0, + "step": 10693 + }, + { + "epoch": 1.1743905117504942, + "grad_norm": 2.3393678665161133, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7181159257888794, + "num_tokens": 270628568.0, + "step": 10694 + }, + { + "epoch": 1.1745003294531078, + "grad_norm": 2.260770082473755, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7041536569595337, + "num_tokens": 270656167.0, + "step": 10695 + }, + { + "epoch": 1.1746101471557215, + "grad_norm": 2.4306342601776123, + "learning_rate": 1e-06, + "loss": 0.937, + "mean_token_accuracy": 0.7061449289321899, + "num_tokens": 270680216.0, + "step": 10696 + }, + { + "epoch": 1.174719964858335, + "grad_norm": 1.9705835580825806, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.6922532320022583, + "num_tokens": 270711807.0, + "step": 10697 + }, + { + "epoch": 1.1748297825609488, + "grad_norm": 1.9017987251281738, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7169885039329529, + "num_tokens": 270746994.0, + "step": 10698 + }, + { + "epoch": 1.1749396002635626, + "grad_norm": 2.078993558883667, + 
"learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7175887227058411, + "num_tokens": 270778000.0, + "step": 10699 + }, + { + "epoch": 1.1750494179661761, + "grad_norm": 2.1279616355895996, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7055094838142395, + "num_tokens": 270805195.0, + "step": 10700 + }, + { + "epoch": 1.1751592356687899, + "grad_norm": 2.3623602390289307, + "learning_rate": 1e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7307654619216919, + "num_tokens": 270828166.0, + "step": 10701 + }, + { + "epoch": 1.1752690533714034, + "grad_norm": 2.371321201324463, + "learning_rate": 1e-06, + "loss": 0.8035, + "mean_token_accuracy": 0.7460646629333496, + "num_tokens": 270849559.0, + "step": 10702 + }, + { + "epoch": 1.1753788710740172, + "grad_norm": 2.1285159587860107, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.72803795337677, + "num_tokens": 270876215.0, + "step": 10703 + }, + { + "epoch": 1.1754886887766307, + "grad_norm": 2.1944496631622314, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.6970348954200745, + "num_tokens": 270903296.0, + "step": 10704 + }, + { + "epoch": 1.1755985064792445, + "grad_norm": 2.451817750930786, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7247269153594971, + "num_tokens": 270926448.0, + "step": 10705 + }, + { + "epoch": 1.175708324181858, + "grad_norm": 2.150861978530884, + "learning_rate": 1e-06, + "loss": 0.8453, + "mean_token_accuracy": 0.7432166934013367, + "num_tokens": 270953373.0, + "step": 10706 + }, + { + "epoch": 1.1758181418844718, + "grad_norm": 2.1408464908599854, + "learning_rate": 1e-06, + "loss": 0.8828, + "mean_token_accuracy": 0.7208043932914734, + "num_tokens": 270981016.0, + "step": 10707 + }, + { + "epoch": 1.1759279595870855, + "grad_norm": 2.3694310188293457, + "learning_rate": 1e-06, + "loss": 0.7745, + "mean_token_accuracy": 0.7488614916801453, + "num_tokens": 271001591.0, + "step": 
10708 + }, + { + "epoch": 1.176037777289699, + "grad_norm": 2.626208543777466, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7230224609375, + "num_tokens": 271021565.0, + "step": 10709 + }, + { + "epoch": 1.1761475949923128, + "grad_norm": 2.1265227794647217, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7016432285308838, + "num_tokens": 271049955.0, + "step": 10710 + }, + { + "epoch": 1.1762574126949263, + "grad_norm": 1.9467604160308838, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7229396104812622, + "num_tokens": 271080528.0, + "step": 10711 + }, + { + "epoch": 1.17636723039754, + "grad_norm": 2.6145124435424805, + "learning_rate": 1e-06, + "loss": 0.8523, + "mean_token_accuracy": 0.7343660593032837, + "num_tokens": 271099799.0, + "step": 10712 + }, + { + "epoch": 1.1764770481001539, + "grad_norm": 2.095627546310425, + "learning_rate": 1e-06, + "loss": 0.8415, + "mean_token_accuracy": 0.7298882603645325, + "num_tokens": 271128463.0, + "step": 10713 + }, + { + "epoch": 1.1765868658027674, + "grad_norm": 2.830599784851074, + "learning_rate": 1e-06, + "loss": 0.8531, + "mean_token_accuracy": 0.7275261878967285, + "num_tokens": 271146542.0, + "step": 10714 + }, + { + "epoch": 1.1766966835053811, + "grad_norm": 2.041649103164673, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7095630764961243, + "num_tokens": 271175733.0, + "step": 10715 + }, + { + "epoch": 1.1768065012079947, + "grad_norm": 2.3631906509399414, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7187876105308533, + "num_tokens": 271200622.0, + "step": 10716 + }, + { + "epoch": 1.1769163189106084, + "grad_norm": 2.2547969818115234, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7174651622772217, + "num_tokens": 271225844.0, + "step": 10717 + }, + { + "epoch": 1.177026136613222, + "grad_norm": 2.789813995361328, + "learning_rate": 1e-06, + "loss": 0.8831, + 
"mean_token_accuracy": 0.7255555987358093, + "num_tokens": 271244910.0, + "step": 10718 + }, + { + "epoch": 1.1771359543158357, + "grad_norm": 2.414973020553589, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7193450331687927, + "num_tokens": 271268608.0, + "step": 10719 + }, + { + "epoch": 1.1772457720184493, + "grad_norm": 2.268415927886963, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7066506147384644, + "num_tokens": 271294238.0, + "step": 10720 + }, + { + "epoch": 1.177355589721063, + "grad_norm": 2.3691656589508057, + "learning_rate": 1e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.723340630531311, + "num_tokens": 271316235.0, + "step": 10721 + }, + { + "epoch": 1.1774654074236768, + "grad_norm": 2.3221004009246826, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.6974142789840698, + "num_tokens": 271342311.0, + "step": 10722 + }, + { + "epoch": 1.1775752251262903, + "grad_norm": 2.295743942260742, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7419922351837158, + "num_tokens": 271369089.0, + "step": 10723 + }, + { + "epoch": 1.177685042828904, + "grad_norm": 2.1278326511383057, + "learning_rate": 1e-06, + "loss": 0.8564, + "mean_token_accuracy": 0.728766918182373, + "num_tokens": 271397011.0, + "step": 10724 + }, + { + "epoch": 1.1777948605315176, + "grad_norm": 2.421332359313965, + "learning_rate": 1e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.7277100086212158, + "num_tokens": 271420156.0, + "step": 10725 + }, + { + "epoch": 1.1779046782341314, + "grad_norm": 2.6563727855682373, + "learning_rate": 1e-06, + "loss": 0.823, + "mean_token_accuracy": 0.7421652674674988, + "num_tokens": 271439776.0, + "step": 10726 + }, + { + "epoch": 1.1780144959367451, + "grad_norm": 2.3238353729248047, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7037978172302246, + "num_tokens": 271466591.0, + "step": 10727 + }, + { + "epoch": 1.1781243136393587, 
+ "grad_norm": 2.1160900592803955, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7114375829696655, + "num_tokens": 271494903.0, + "step": 10728 + }, + { + "epoch": 1.1782341313419724, + "grad_norm": 2.208770751953125, + "learning_rate": 1e-06, + "loss": 0.7605, + "mean_token_accuracy": 0.761569082736969, + "num_tokens": 271520272.0, + "step": 10729 + }, + { + "epoch": 1.178343949044586, + "grad_norm": 2.1956584453582764, + "learning_rate": 1e-06, + "loss": 0.8091, + "mean_token_accuracy": 0.7533590793609619, + "num_tokens": 271544692.0, + "step": 10730 + }, + { + "epoch": 1.1784537667471997, + "grad_norm": 2.190073013305664, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.726886510848999, + "num_tokens": 271571263.0, + "step": 10731 + }, + { + "epoch": 1.1785635844498132, + "grad_norm": 2.301241397857666, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7109472751617432, + "num_tokens": 271596450.0, + "step": 10732 + }, + { + "epoch": 1.178673402152427, + "grad_norm": 2.226623773574829, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7039013504981995, + "num_tokens": 271624822.0, + "step": 10733 + }, + { + "epoch": 1.1787832198550405, + "grad_norm": 2.3026819229125977, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7154355645179749, + "num_tokens": 271650101.0, + "step": 10734 + }, + { + "epoch": 1.1788930375576543, + "grad_norm": 2.338519334793091, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7068008184432983, + "num_tokens": 271674805.0, + "step": 10735 + }, + { + "epoch": 1.179002855260268, + "grad_norm": 2.4703662395477295, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7328047752380371, + "num_tokens": 271698484.0, + "step": 10736 + }, + { + "epoch": 1.1791126729628816, + "grad_norm": 2.2906882762908936, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7225723266601562, + 
"num_tokens": 271722416.0, + "step": 10737 + }, + { + "epoch": 1.1792224906654953, + "grad_norm": 2.758823871612549, + "learning_rate": 1e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.720039963722229, + "num_tokens": 271741369.0, + "step": 10738 + }, + { + "epoch": 1.1793323083681089, + "grad_norm": 2.05539608001709, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7150484323501587, + "num_tokens": 271769533.0, + "step": 10739 + }, + { + "epoch": 1.1794421260707226, + "grad_norm": 1.9482873678207397, + "learning_rate": 1e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.7171491384506226, + "num_tokens": 271799674.0, + "step": 10740 + }, + { + "epoch": 1.1795519437733364, + "grad_norm": 2.102804183959961, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7053995132446289, + "num_tokens": 271829355.0, + "step": 10741 + }, + { + "epoch": 1.17966176147595, + "grad_norm": 2.4175682067871094, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7063594460487366, + "num_tokens": 271854485.0, + "step": 10742 + }, + { + "epoch": 1.1797715791785637, + "grad_norm": 2.3478448390960693, + "learning_rate": 1e-06, + "loss": 0.7927, + "mean_token_accuracy": 0.7451196312904358, + "num_tokens": 271876497.0, + "step": 10743 + }, + { + "epoch": 1.1798813968811772, + "grad_norm": 2.4313528537750244, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7137001752853394, + "num_tokens": 271900166.0, + "step": 10744 + }, + { + "epoch": 1.179991214583791, + "grad_norm": 2.554553747177124, + "learning_rate": 1e-06, + "loss": 0.8537, + "mean_token_accuracy": 0.7294318675994873, + "num_tokens": 271919943.0, + "step": 10745 + }, + { + "epoch": 1.1801010322864045, + "grad_norm": 2.437086820602417, + "learning_rate": 1e-06, + "loss": 0.8425, + "mean_token_accuracy": 0.7336630821228027, + "num_tokens": 271941837.0, + "step": 10746 + }, + { + "epoch": 1.1802108499890183, + "grad_norm": 2.01578688621521, + 
"learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.701657772064209, + "num_tokens": 271975209.0, + "step": 10747 + }, + { + "epoch": 1.1803206676916318, + "grad_norm": 1.9240180253982544, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7236318588256836, + "num_tokens": 272006761.0, + "step": 10748 + }, + { + "epoch": 1.1804304853942456, + "grad_norm": 2.5053248405456543, + "learning_rate": 1e-06, + "loss": 0.7858, + "mean_token_accuracy": 0.7543676495552063, + "num_tokens": 272025202.0, + "step": 10749 + }, + { + "epoch": 1.1805403030968593, + "grad_norm": 2.245933771133423, + "learning_rate": 1e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.71775221824646, + "num_tokens": 272050226.0, + "step": 10750 + }, + { + "epoch": 1.1806501207994728, + "grad_norm": 1.9732762575149536, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.6980922222137451, + "num_tokens": 272082802.0, + "step": 10751 + }, + { + "epoch": 1.1807599385020866, + "grad_norm": 2.486715078353882, + "learning_rate": 1e-06, + "loss": 0.8279, + "mean_token_accuracy": 0.7365531921386719, + "num_tokens": 272104394.0, + "step": 10752 + }, + { + "epoch": 1.1808697562047001, + "grad_norm": 2.3006818294525146, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7088087797164917, + "num_tokens": 272128442.0, + "step": 10753 + }, + { + "epoch": 1.180979573907314, + "grad_norm": 2.2449951171875, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7022260427474976, + "num_tokens": 272155954.0, + "step": 10754 + }, + { + "epoch": 1.1810893916099274, + "grad_norm": 2.10050368309021, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7078284025192261, + "num_tokens": 272184906.0, + "step": 10755 + }, + { + "epoch": 1.1811992093125412, + "grad_norm": 2.3320486545562744, + "learning_rate": 1e-06, + "loss": 0.8736, + "mean_token_accuracy": 0.7279126644134521, + "num_tokens": 272210696.0, + "step": 
10756 + }, + { + "epoch": 1.1813090270151547, + "grad_norm": 2.1546216011047363, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7233476638793945, + "num_tokens": 272238588.0, + "step": 10757 + }, + { + "epoch": 1.1814188447177685, + "grad_norm": 2.3158388137817383, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.736781120300293, + "num_tokens": 272263483.0, + "step": 10758 + }, + { + "epoch": 1.1815286624203822, + "grad_norm": 1.985916256904602, + "learning_rate": 1e-06, + "loss": 0.8467, + "mean_token_accuracy": 0.7350051403045654, + "num_tokens": 272293109.0, + "step": 10759 + }, + { + "epoch": 1.1816384801229958, + "grad_norm": 2.170443296432495, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.6938149929046631, + "num_tokens": 272320833.0, + "step": 10760 + }, + { + "epoch": 1.1817482978256095, + "grad_norm": 2.2219936847686768, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7051172852516174, + "num_tokens": 272348187.0, + "step": 10761 + }, + { + "epoch": 1.181858115528223, + "grad_norm": 1.9504328966140747, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7173155546188354, + "num_tokens": 272380145.0, + "step": 10762 + }, + { + "epoch": 1.1819679332308368, + "grad_norm": 2.5489323139190674, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.721408486366272, + "num_tokens": 272403064.0, + "step": 10763 + }, + { + "epoch": 1.1820777509334506, + "grad_norm": 2.253845691680908, + "learning_rate": 1e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.7271190285682678, + "num_tokens": 272426874.0, + "step": 10764 + }, + { + "epoch": 1.1821875686360641, + "grad_norm": 2.3365774154663086, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7134273052215576, + "num_tokens": 272451650.0, + "step": 10765 + }, + { + "epoch": 1.1822973863386779, + "grad_norm": 2.4008381366729736, + "learning_rate": 1e-06, + "loss": 0.8327, 
+ "mean_token_accuracy": 0.7325624227523804, + "num_tokens": 272473910.0, + "step": 10766 + }, + { + "epoch": 1.1824072040412914, + "grad_norm": 2.518099069595337, + "learning_rate": 1e-06, + "loss": 0.848, + "mean_token_accuracy": 0.7317809462547302, + "num_tokens": 272494349.0, + "step": 10767 + }, + { + "epoch": 1.1825170217439052, + "grad_norm": 2.406308174133301, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7209782600402832, + "num_tokens": 272516277.0, + "step": 10768 + }, + { + "epoch": 1.1826268394465187, + "grad_norm": 2.2888855934143066, + "learning_rate": 1e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7298331260681152, + "num_tokens": 272539007.0, + "step": 10769 + }, + { + "epoch": 1.1827366571491325, + "grad_norm": 2.1402132511138916, + "learning_rate": 1e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7182331085205078, + "num_tokens": 272566135.0, + "step": 10770 + }, + { + "epoch": 1.182846474851746, + "grad_norm": 2.264908790588379, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7341606616973877, + "num_tokens": 272591291.0, + "step": 10771 + }, + { + "epoch": 1.1829562925543597, + "grad_norm": 2.00722599029541, + "learning_rate": 1e-06, + "loss": 0.8643, + "mean_token_accuracy": 0.7297632098197937, + "num_tokens": 272620679.0, + "step": 10772 + }, + { + "epoch": 1.1830661102569735, + "grad_norm": 2.528085231781006, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.711189866065979, + "num_tokens": 272640399.0, + "step": 10773 + }, + { + "epoch": 1.183175927959587, + "grad_norm": 2.2924466133117676, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7152606248855591, + "num_tokens": 272669245.0, + "step": 10774 + }, + { + "epoch": 1.1832857456622008, + "grad_norm": 2.420487642288208, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.705064058303833, + "num_tokens": 272691925.0, + "step": 10775 + }, + { + "epoch": 1.1833955633648143, + 
"grad_norm": 1.9194971323013306, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.6942665576934814, + "num_tokens": 272727131.0, + "step": 10776 + }, + { + "epoch": 1.183505381067428, + "grad_norm": 2.3610405921936035, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7119148373603821, + "num_tokens": 272750902.0, + "step": 10777 + }, + { + "epoch": 1.1836151987700418, + "grad_norm": 2.3947625160217285, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.701248049736023, + "num_tokens": 272774363.0, + "step": 10778 + }, + { + "epoch": 1.1837250164726554, + "grad_norm": 2.2297325134277344, + "learning_rate": 1e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.7285608053207397, + "num_tokens": 272800109.0, + "step": 10779 + }, + { + "epoch": 1.1838348341752691, + "grad_norm": 2.0559353828430176, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7250839471817017, + "num_tokens": 272830385.0, + "step": 10780 + }, + { + "epoch": 1.1839446518778827, + "grad_norm": 2.3849172592163086, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7042315006256104, + "num_tokens": 272854707.0, + "step": 10781 + }, + { + "epoch": 1.1840544695804964, + "grad_norm": 2.0033483505249023, + "learning_rate": 1e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.6901832222938538, + "num_tokens": 272888370.0, + "step": 10782 + }, + { + "epoch": 1.18416428728311, + "grad_norm": 2.2983977794647217, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.6974208950996399, + "num_tokens": 272914966.0, + "step": 10783 + }, + { + "epoch": 1.1842741049857237, + "grad_norm": 2.568387985229492, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7090123891830444, + "num_tokens": 272935613.0, + "step": 10784 + }, + { + "epoch": 1.1843839226883373, + "grad_norm": 2.260171413421631, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7231128811836243, + 
"num_tokens": 272962016.0, + "step": 10785 + }, + { + "epoch": 1.184493740390951, + "grad_norm": 2.478348731994629, + "learning_rate": 1e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7204095125198364, + "num_tokens": 272983258.0, + "step": 10786 + }, + { + "epoch": 1.1846035580935648, + "grad_norm": 1.9597573280334473, + "learning_rate": 1e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.72747802734375, + "num_tokens": 273016447.0, + "step": 10787 + }, + { + "epoch": 1.1847133757961783, + "grad_norm": 2.251321792602539, + "learning_rate": 1e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.7146497964859009, + "num_tokens": 273041979.0, + "step": 10788 + }, + { + "epoch": 1.184823193498792, + "grad_norm": 2.3341526985168457, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7051982879638672, + "num_tokens": 273066854.0, + "step": 10789 + }, + { + "epoch": 1.1849330112014056, + "grad_norm": 2.4007842540740967, + "learning_rate": 1e-06, + "loss": 0.8654, + "mean_token_accuracy": 0.7334945201873779, + "num_tokens": 273089361.0, + "step": 10790 + }, + { + "epoch": 1.1850428289040194, + "grad_norm": 2.068373918533325, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7154363393783569, + "num_tokens": 273118514.0, + "step": 10791 + }, + { + "epoch": 1.185152646606633, + "grad_norm": 2.371600866317749, + "learning_rate": 1e-06, + "loss": 0.8397, + "mean_token_accuracy": 0.7324795722961426, + "num_tokens": 273141182.0, + "step": 10792 + }, + { + "epoch": 1.1852624643092466, + "grad_norm": 2.236581325531006, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7062548995018005, + "num_tokens": 273166384.0, + "step": 10793 + }, + { + "epoch": 1.1853722820118604, + "grad_norm": 2.383665084838867, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7130943536758423, + "num_tokens": 273189337.0, + "step": 10794 + }, + { + "epoch": 1.185482099714474, + "grad_norm": 2.398165702819824, + 
"learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7257157564163208, + "num_tokens": 273214780.0, + "step": 10795 + }, + { + "epoch": 1.1855919174170877, + "grad_norm": 2.188504934310913, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7148839235305786, + "num_tokens": 273240200.0, + "step": 10796 + }, + { + "epoch": 1.1857017351197012, + "grad_norm": 2.4239253997802734, + "learning_rate": 1e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7293685674667358, + "num_tokens": 273264694.0, + "step": 10797 + }, + { + "epoch": 1.185811552822315, + "grad_norm": 2.140106439590454, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7250096797943115, + "num_tokens": 273291836.0, + "step": 10798 + }, + { + "epoch": 1.1859213705249285, + "grad_norm": 2.269192934036255, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7055127024650574, + "num_tokens": 273318100.0, + "step": 10799 + }, + { + "epoch": 1.1860311882275423, + "grad_norm": 2.3700456619262695, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7194772362709045, + "num_tokens": 273343103.0, + "step": 10800 + }, + { + "epoch": 1.186141005930156, + "grad_norm": 2.1879143714904785, + "learning_rate": 1e-06, + "loss": 0.8357, + "mean_token_accuracy": 0.7332777976989746, + "num_tokens": 273369991.0, + "step": 10801 + }, + { + "epoch": 1.1862508236327696, + "grad_norm": 2.1927430629730225, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.6973668336868286, + "num_tokens": 273400036.0, + "step": 10802 + }, + { + "epoch": 1.1863606413353833, + "grad_norm": 2.4211747646331787, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7230228781700134, + "num_tokens": 273422945.0, + "step": 10803 + }, + { + "epoch": 1.1864704590379969, + "grad_norm": 2.0635364055633545, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.6967546939849854, + "num_tokens": 273453369.0, + 
"step": 10804 + }, + { + "epoch": 1.1865802767406106, + "grad_norm": 2.3201403617858887, + "learning_rate": 1e-06, + "loss": 0.8427, + "mean_token_accuracy": 0.7358171343803406, + "num_tokens": 273478395.0, + "step": 10805 + }, + { + "epoch": 1.1866900944432242, + "grad_norm": 2.3678207397460938, + "learning_rate": 1e-06, + "loss": 0.8639, + "mean_token_accuracy": 0.7343010902404785, + "num_tokens": 273500336.0, + "step": 10806 + }, + { + "epoch": 1.186799912145838, + "grad_norm": 2.0381274223327637, + "learning_rate": 1e-06, + "loss": 0.8557, + "mean_token_accuracy": 0.7345454096794128, + "num_tokens": 273531183.0, + "step": 10807 + }, + { + "epoch": 1.1869097298484517, + "grad_norm": 2.1380348205566406, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7123666405677795, + "num_tokens": 273559825.0, + "step": 10808 + }, + { + "epoch": 1.1870195475510652, + "grad_norm": 2.532621383666992, + "learning_rate": 1e-06, + "loss": 0.7853, + "mean_token_accuracy": 0.7495065927505493, + "num_tokens": 273581065.0, + "step": 10809 + }, + { + "epoch": 1.187129365253679, + "grad_norm": 2.2672181129455566, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7315954566001892, + "num_tokens": 273606780.0, + "step": 10810 + }, + { + "epoch": 1.1872391829562925, + "grad_norm": 2.098345994949341, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7181700468063354, + "num_tokens": 273635456.0, + "step": 10811 + }, + { + "epoch": 1.1873490006589063, + "grad_norm": 2.2109930515289307, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7056012153625488, + "num_tokens": 273663134.0, + "step": 10812 + }, + { + "epoch": 1.1874588183615198, + "grad_norm": 2.3651130199432373, + "learning_rate": 1e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.7335373163223267, + "num_tokens": 273686888.0, + "step": 10813 + }, + { + "epoch": 1.1875686360641335, + "grad_norm": 2.2693116664886475, + "learning_rate": 1e-06, + 
"loss": 0.8567, + "mean_token_accuracy": 0.7363046407699585, + "num_tokens": 273710802.0, + "step": 10814 + }, + { + "epoch": 1.1876784537667473, + "grad_norm": 2.499406576156616, + "learning_rate": 1e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.719617486000061, + "num_tokens": 273731566.0, + "step": 10815 + }, + { + "epoch": 1.1877882714693608, + "grad_norm": 2.416602849960327, + "learning_rate": 1e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.741683304309845, + "num_tokens": 273753534.0, + "step": 10816 + }, + { + "epoch": 1.1878980891719746, + "grad_norm": 2.5267319679260254, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7133331894874573, + "num_tokens": 273774557.0, + "step": 10817 + }, + { + "epoch": 1.1880079068745881, + "grad_norm": 2.3614039421081543, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7216678857803345, + "num_tokens": 273799198.0, + "step": 10818 + }, + { + "epoch": 1.1881177245772019, + "grad_norm": 2.3531415462493896, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7237002849578857, + "num_tokens": 273823116.0, + "step": 10819 + }, + { + "epoch": 1.1882275422798154, + "grad_norm": 2.3419511318206787, + "learning_rate": 1e-06, + "loss": 0.8269, + "mean_token_accuracy": 0.7424426674842834, + "num_tokens": 273845883.0, + "step": 10820 + }, + { + "epoch": 1.1883373599824292, + "grad_norm": 1.8826156854629517, + "learning_rate": 1e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7077735662460327, + "num_tokens": 273880341.0, + "step": 10821 + }, + { + "epoch": 1.1884471776850427, + "grad_norm": 2.1217384338378906, + "learning_rate": 1e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.7266151309013367, + "num_tokens": 273906326.0, + "step": 10822 + }, + { + "epoch": 1.1885569953876565, + "grad_norm": 2.4576034545898438, + "learning_rate": 1e-06, + "loss": 0.8365, + "mean_token_accuracy": 0.7382404208183289, + "num_tokens": 273927883.0, + "step": 10823 + }, + { + 
"epoch": 1.1886668130902702, + "grad_norm": 2.025359869003296, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7111703157424927, + "num_tokens": 273958643.0, + "step": 10824 + }, + { + "epoch": 1.1887766307928838, + "grad_norm": 2.23527455329895, + "learning_rate": 1e-06, + "loss": 0.8298, + "mean_token_accuracy": 0.744359016418457, + "num_tokens": 273982379.0, + "step": 10825 + }, + { + "epoch": 1.1888864484954975, + "grad_norm": 2.5434458255767822, + "learning_rate": 1e-06, + "loss": 0.7852, + "mean_token_accuracy": 0.7462902069091797, + "num_tokens": 274001441.0, + "step": 10826 + }, + { + "epoch": 1.188996266198111, + "grad_norm": 2.418250322341919, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7054612636566162, + "num_tokens": 274023967.0, + "step": 10827 + }, + { + "epoch": 1.1891060839007248, + "grad_norm": 2.702430248260498, + "learning_rate": 1e-06, + "loss": 0.8107, + "mean_token_accuracy": 0.7421807050704956, + "num_tokens": 274042980.0, + "step": 10828 + }, + { + "epoch": 1.1892159016033386, + "grad_norm": 2.3584647178649902, + "learning_rate": 1e-06, + "loss": 0.8146, + "mean_token_accuracy": 0.7422507405281067, + "num_tokens": 274065271.0, + "step": 10829 + }, + { + "epoch": 1.189325719305952, + "grad_norm": 1.9361447095870972, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7053978443145752, + "num_tokens": 274097662.0, + "step": 10830 + }, + { + "epoch": 1.1894355370085659, + "grad_norm": 1.9684785604476929, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7121058106422424, + "num_tokens": 274130717.0, + "step": 10831 + }, + { + "epoch": 1.1895453547111794, + "grad_norm": 2.360982894897461, + "learning_rate": 1e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.6922547817230225, + "num_tokens": 274154961.0, + "step": 10832 + }, + { + "epoch": 1.1896551724137931, + "grad_norm": 1.9925024509429932, + "learning_rate": 1e-06, + "loss": 0.9051, + 
"mean_token_accuracy": 0.716235876083374, + "num_tokens": 274188545.0, + "step": 10833 + }, + { + "epoch": 1.1897649901164067, + "grad_norm": 2.298079490661621, + "learning_rate": 1e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7163695096969604, + "num_tokens": 274212038.0, + "step": 10834 + }, + { + "epoch": 1.1898748078190204, + "grad_norm": 2.3621668815612793, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7141844034194946, + "num_tokens": 274235615.0, + "step": 10835 + }, + { + "epoch": 1.189984625521634, + "grad_norm": 2.0658655166625977, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.6888180375099182, + "num_tokens": 274268596.0, + "step": 10836 + }, + { + "epoch": 1.1900944432242477, + "grad_norm": 2.3030343055725098, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7233991622924805, + "num_tokens": 274295530.0, + "step": 10837 + }, + { + "epoch": 1.1902042609268615, + "grad_norm": 2.518954277038574, + "learning_rate": 1e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7265534400939941, + "num_tokens": 274316510.0, + "step": 10838 + }, + { + "epoch": 1.190314078629475, + "grad_norm": 2.178203582763672, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7115267515182495, + "num_tokens": 274343056.0, + "step": 10839 + }, + { + "epoch": 1.1904238963320888, + "grad_norm": 2.171802520751953, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7069447040557861, + "num_tokens": 274371757.0, + "step": 10840 + }, + { + "epoch": 1.1905337140347023, + "grad_norm": 2.159168004989624, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.716396689414978, + "num_tokens": 274399005.0, + "step": 10841 + }, + { + "epoch": 1.190643531737316, + "grad_norm": 2.2937774658203125, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7223442792892456, + "num_tokens": 274422916.0, + "step": 10842 + }, + { + "epoch": 1.1907533494399298, + 
"grad_norm": 2.259880304336548, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7105343341827393, + "num_tokens": 274447365.0, + "step": 10843 + }, + { + "epoch": 1.1908631671425434, + "grad_norm": 2.1377196311950684, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7000833749771118, + "num_tokens": 274476298.0, + "step": 10844 + }, + { + "epoch": 1.1909729848451571, + "grad_norm": 2.0299389362335205, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7178515195846558, + "num_tokens": 274506036.0, + "step": 10845 + }, + { + "epoch": 1.1910828025477707, + "grad_norm": 2.105238437652588, + "learning_rate": 1e-06, + "loss": 0.833, + "mean_token_accuracy": 0.7426921129226685, + "num_tokens": 274531751.0, + "step": 10846 + }, + { + "epoch": 1.1911926202503844, + "grad_norm": 2.0799612998962402, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.724303126335144, + "num_tokens": 274561344.0, + "step": 10847 + }, + { + "epoch": 1.191302437952998, + "grad_norm": 2.1860897541046143, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7351738810539246, + "num_tokens": 274587956.0, + "step": 10848 + }, + { + "epoch": 1.1914122556556117, + "grad_norm": 2.34922456741333, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7142643928527832, + "num_tokens": 274611718.0, + "step": 10849 + }, + { + "epoch": 1.1915220733582252, + "grad_norm": 2.1805858612060547, + "learning_rate": 1e-06, + "loss": 0.8224, + "mean_token_accuracy": 0.7410473823547363, + "num_tokens": 274636567.0, + "step": 10850 + }, + { + "epoch": 1.191631891060839, + "grad_norm": 2.180009365081787, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.705707311630249, + "num_tokens": 274665625.0, + "step": 10851 + }, + { + "epoch": 1.1917417087634528, + "grad_norm": 2.2978529930114746, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7084190845489502, + 
"num_tokens": 274690797.0, + "step": 10852 + }, + { + "epoch": 1.1918515264660663, + "grad_norm": 2.346238613128662, + "learning_rate": 1e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7315898537635803, + "num_tokens": 274714447.0, + "step": 10853 + }, + { + "epoch": 1.19196134416868, + "grad_norm": 2.2718777656555176, + "learning_rate": 1e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7284035682678223, + "num_tokens": 274739857.0, + "step": 10854 + }, + { + "epoch": 1.1920711618712936, + "grad_norm": 1.9679237604141235, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7264628410339355, + "num_tokens": 274768374.0, + "step": 10855 + }, + { + "epoch": 1.1921809795739073, + "grad_norm": 2.236325263977051, + "learning_rate": 1e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7264183759689331, + "num_tokens": 274795266.0, + "step": 10856 + }, + { + "epoch": 1.192290797276521, + "grad_norm": 2.0445213317871094, + "learning_rate": 1e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.6824004650115967, + "num_tokens": 274826066.0, + "step": 10857 + }, + { + "epoch": 1.1924006149791346, + "grad_norm": 2.390031337738037, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7143183350563049, + "num_tokens": 274849171.0, + "step": 10858 + }, + { + "epoch": 1.1925104326817484, + "grad_norm": 2.4281277656555176, + "learning_rate": 1e-06, + "loss": 0.8113, + "mean_token_accuracy": 0.7506670951843262, + "num_tokens": 274869257.0, + "step": 10859 + }, + { + "epoch": 1.192620250384362, + "grad_norm": 2.330249309539795, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.6842455863952637, + "num_tokens": 274896523.0, + "step": 10860 + }, + { + "epoch": 1.1927300680869757, + "grad_norm": 2.296926736831665, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7175570130348206, + "num_tokens": 274920974.0, + "step": 10861 + }, + { + "epoch": 1.1928398857895892, + "grad_norm": 2.104816198348999, + 
"learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.6996557116508484, + "num_tokens": 274948165.0, + "step": 10862 + }, + { + "epoch": 1.192949703492203, + "grad_norm": 2.463284492492676, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7111512422561646, + "num_tokens": 274970371.0, + "step": 10863 + }, + { + "epoch": 1.1930595211948165, + "grad_norm": 2.3503005504608154, + "learning_rate": 1e-06, + "loss": 0.8521, + "mean_token_accuracy": 0.7399965524673462, + "num_tokens": 274992832.0, + "step": 10864 + }, + { + "epoch": 1.1931693388974303, + "grad_norm": 2.1688785552978516, + "learning_rate": 1e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7328503131866455, + "num_tokens": 275018492.0, + "step": 10865 + }, + { + "epoch": 1.193279156600044, + "grad_norm": 2.038092851638794, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7101887464523315, + "num_tokens": 275049156.0, + "step": 10866 + }, + { + "epoch": 1.1933889743026576, + "grad_norm": 2.058695077896118, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.6973238587379456, + "num_tokens": 275080180.0, + "step": 10867 + }, + { + "epoch": 1.1934987920052713, + "grad_norm": 2.260988712310791, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7122598886489868, + "num_tokens": 275108624.0, + "step": 10868 + }, + { + "epoch": 1.1936086097078848, + "grad_norm": 2.433180570602417, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7220090627670288, + "num_tokens": 275129375.0, + "step": 10869 + }, + { + "epoch": 1.1937184274104986, + "grad_norm": 2.229175090789795, + "learning_rate": 1e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.7238023281097412, + "num_tokens": 275156153.0, + "step": 10870 + }, + { + "epoch": 1.1938282451131121, + "grad_norm": 2.226559638977051, + "learning_rate": 1e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.692923903465271, + "num_tokens": 275181576.0, + "step": 
10871 + }, + { + "epoch": 1.193938062815726, + "grad_norm": 2.290510416030884, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7182123064994812, + "num_tokens": 275203991.0, + "step": 10872 + }, + { + "epoch": 1.1940478805183394, + "grad_norm": 2.1630091667175293, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7183308005332947, + "num_tokens": 275231072.0, + "step": 10873 + }, + { + "epoch": 1.1941576982209532, + "grad_norm": 2.0191054344177246, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7080422639846802, + "num_tokens": 275262082.0, + "step": 10874 + }, + { + "epoch": 1.194267515923567, + "grad_norm": 2.2930541038513184, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7071423530578613, + "num_tokens": 275288194.0, + "step": 10875 + }, + { + "epoch": 1.1943773336261805, + "grad_norm": 2.6056172847747803, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7076346278190613, + "num_tokens": 275308621.0, + "step": 10876 + }, + { + "epoch": 1.1944871513287942, + "grad_norm": 2.193509817123413, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7243514060974121, + "num_tokens": 275333950.0, + "step": 10877 + }, + { + "epoch": 1.1945969690314078, + "grad_norm": 2.3704681396484375, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7169302701950073, + "num_tokens": 275357714.0, + "step": 10878 + }, + { + "epoch": 1.1947067867340215, + "grad_norm": 2.123828411102295, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.711135745048523, + "num_tokens": 275386445.0, + "step": 10879 + }, + { + "epoch": 1.1948166044366353, + "grad_norm": 2.396868944168091, + "learning_rate": 1e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7183767557144165, + "num_tokens": 275410567.0, + "step": 10880 + }, + { + "epoch": 1.1949264221392488, + "grad_norm": 2.2070298194885254, + "learning_rate": 1e-06, + "loss": 1.0101, 
+ "mean_token_accuracy": 0.6952638030052185, + "num_tokens": 275439452.0, + "step": 10881 + }, + { + "epoch": 1.1950362398418626, + "grad_norm": 2.23852276802063, + "learning_rate": 1e-06, + "loss": 0.847, + "mean_token_accuracy": 0.733216404914856, + "num_tokens": 275462498.0, + "step": 10882 + }, + { + "epoch": 1.1951460575444761, + "grad_norm": 2.5449280738830566, + "learning_rate": 1e-06, + "loss": 0.8785, + "mean_token_accuracy": 0.721983790397644, + "num_tokens": 275483432.0, + "step": 10883 + }, + { + "epoch": 1.1952558752470899, + "grad_norm": 2.3175063133239746, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7170026302337646, + "num_tokens": 275506494.0, + "step": 10884 + }, + { + "epoch": 1.1953656929497034, + "grad_norm": 2.5154097080230713, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7121006846427917, + "num_tokens": 275528537.0, + "step": 10885 + }, + { + "epoch": 1.1954755106523172, + "grad_norm": 2.1334128379821777, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7287237644195557, + "num_tokens": 275554980.0, + "step": 10886 + }, + { + "epoch": 1.1955853283549307, + "grad_norm": 2.841470956802368, + "learning_rate": 1e-06, + "loss": 0.8258, + "mean_token_accuracy": 0.7343387603759766, + "num_tokens": 275572177.0, + "step": 10887 + }, + { + "epoch": 1.1956951460575445, + "grad_norm": 2.2711081504821777, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7184311747550964, + "num_tokens": 275597697.0, + "step": 10888 + }, + { + "epoch": 1.1958049637601582, + "grad_norm": 2.194983959197998, + "learning_rate": 1e-06, + "loss": 0.849, + "mean_token_accuracy": 0.7362534999847412, + "num_tokens": 275621025.0, + "step": 10889 + }, + { + "epoch": 1.1959147814627717, + "grad_norm": 2.084808349609375, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7218737602233887, + "num_tokens": 275648105.0, + "step": 10890 + }, + { + "epoch": 
1.1960245991653855, + "grad_norm": 2.163116693496704, + "learning_rate": 1e-06, + "loss": 0.8202, + "mean_token_accuracy": 0.7509746551513672, + "num_tokens": 275675807.0, + "step": 10891 + }, + { + "epoch": 1.196134416867999, + "grad_norm": 2.268850803375244, + "learning_rate": 1e-06, + "loss": 0.779, + "mean_token_accuracy": 0.748174786567688, + "num_tokens": 275698244.0, + "step": 10892 + }, + { + "epoch": 1.1962442345706128, + "grad_norm": 2.3641536235809326, + "learning_rate": 1e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.7349382638931274, + "num_tokens": 275720059.0, + "step": 10893 + }, + { + "epoch": 1.1963540522732266, + "grad_norm": 2.0976428985595703, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7085822820663452, + "num_tokens": 275748377.0, + "step": 10894 + }, + { + "epoch": 1.19646386997584, + "grad_norm": 2.3664824962615967, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7155197262763977, + "num_tokens": 275772596.0, + "step": 10895 + }, + { + "epoch": 1.1965736876784538, + "grad_norm": 2.173250436782837, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7172374129295349, + "num_tokens": 275796625.0, + "step": 10896 + }, + { + "epoch": 1.1966835053810674, + "grad_norm": 2.236863136291504, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.71756511926651, + "num_tokens": 275821625.0, + "step": 10897 + }, + { + "epoch": 1.1967933230836811, + "grad_norm": 2.1957778930664062, + "learning_rate": 1e-06, + "loss": 0.8537, + "mean_token_accuracy": 0.7290043830871582, + "num_tokens": 275845177.0, + "step": 10898 + }, + { + "epoch": 1.1969031407862947, + "grad_norm": 1.998068928718567, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7025607228279114, + "num_tokens": 275877483.0, + "step": 10899 + }, + { + "epoch": 1.1970129584889084, + "grad_norm": 2.176156520843506, + "learning_rate": 1e-06, + "loss": 0.8394, + "mean_token_accuracy": 
0.7295018434524536, + "num_tokens": 275904465.0, + "step": 10900 + }, + { + "epoch": 1.197122776191522, + "grad_norm": 2.1415984630584717, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7014869451522827, + "num_tokens": 275929973.0, + "step": 10901 + }, + { + "epoch": 1.1972325938941357, + "grad_norm": 2.321810245513916, + "learning_rate": 1e-06, + "loss": 0.7665, + "mean_token_accuracy": 0.7559022903442383, + "num_tokens": 275951637.0, + "step": 10902 + }, + { + "epoch": 1.1973424115967495, + "grad_norm": 2.3786120414733887, + "learning_rate": 1e-06, + "loss": 0.8165, + "mean_token_accuracy": 0.7375379800796509, + "num_tokens": 275974342.0, + "step": 10903 + }, + { + "epoch": 1.197452229299363, + "grad_norm": 2.479307174682617, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.707660436630249, + "num_tokens": 275996635.0, + "step": 10904 + }, + { + "epoch": 1.1975620470019768, + "grad_norm": 2.425046682357788, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7243425846099854, + "num_tokens": 276018546.0, + "step": 10905 + }, + { + "epoch": 1.1976718647045903, + "grad_norm": 2.8544209003448486, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7187087535858154, + "num_tokens": 276036382.0, + "step": 10906 + }, + { + "epoch": 1.197781682407204, + "grad_norm": 2.0629470348358154, + "learning_rate": 1e-06, + "loss": 0.8557, + "mean_token_accuracy": 0.7321957945823669, + "num_tokens": 276064355.0, + "step": 10907 + }, + { + "epoch": 1.1978915001098178, + "grad_norm": 2.519519090652466, + "learning_rate": 1e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.7402966618537903, + "num_tokens": 276085536.0, + "step": 10908 + }, + { + "epoch": 1.1980013178124314, + "grad_norm": 2.249389886856079, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7223786115646362, + "num_tokens": 276112137.0, + "step": 10909 + }, + { + "epoch": 1.1981111355150451, + "grad_norm": 
2.4993956089019775, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.702122688293457, + "num_tokens": 276133314.0, + "step": 10910 + }, + { + "epoch": 1.1982209532176586, + "grad_norm": 2.272122383117676, + "learning_rate": 1e-06, + "loss": 0.8526, + "mean_token_accuracy": 0.7334611415863037, + "num_tokens": 276159256.0, + "step": 10911 + }, + { + "epoch": 1.1983307709202724, + "grad_norm": 2.0389256477355957, + "learning_rate": 1e-06, + "loss": 0.8556, + "mean_token_accuracy": 0.732262372970581, + "num_tokens": 276188051.0, + "step": 10912 + }, + { + "epoch": 1.198440588622886, + "grad_norm": 2.403919219970703, + "learning_rate": 1e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.7267951965332031, + "num_tokens": 276211091.0, + "step": 10913 + }, + { + "epoch": 1.1985504063254997, + "grad_norm": 1.9721513986587524, + "learning_rate": 1e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.6986971497535706, + "num_tokens": 276242575.0, + "step": 10914 + }, + { + "epoch": 1.1986602240281132, + "grad_norm": 1.949710726737976, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7048187255859375, + "num_tokens": 276276967.0, + "step": 10915 + }, + { + "epoch": 1.198770041730727, + "grad_norm": 2.2404403686523438, + "learning_rate": 1e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.728070855140686, + "num_tokens": 276301333.0, + "step": 10916 + }, + { + "epoch": 1.1988798594333407, + "grad_norm": 2.205585241317749, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7255775928497314, + "num_tokens": 276326912.0, + "step": 10917 + }, + { + "epoch": 1.1989896771359543, + "grad_norm": 2.1468148231506348, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7136190533638, + "num_tokens": 276355207.0, + "step": 10918 + }, + { + "epoch": 1.199099494838568, + "grad_norm": 2.8813512325286865, + "learning_rate": 1e-06, + "loss": 0.8352, + "mean_token_accuracy": 0.7302093505859375, + "num_tokens": 
276371775.0, + "step": 10919 + }, + { + "epoch": 1.1992093125411816, + "grad_norm": 2.1779181957244873, + "learning_rate": 1e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7040436267852783, + "num_tokens": 276399438.0, + "step": 10920 + }, + { + "epoch": 1.1993191302437953, + "grad_norm": 2.0938472747802734, + "learning_rate": 1e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7188082933425903, + "num_tokens": 276428213.0, + "step": 10921 + }, + { + "epoch": 1.199428947946409, + "grad_norm": 2.263338327407837, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7218912839889526, + "num_tokens": 276453021.0, + "step": 10922 + }, + { + "epoch": 1.1995387656490226, + "grad_norm": 2.1515660285949707, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7216206789016724, + "num_tokens": 276480491.0, + "step": 10923 + }, + { + "epoch": 1.1996485833516364, + "grad_norm": 2.251549482345581, + "learning_rate": 1e-06, + "loss": 0.8356, + "mean_token_accuracy": 0.7365357279777527, + "num_tokens": 276503847.0, + "step": 10924 + }, + { + "epoch": 1.19975840105425, + "grad_norm": 2.41264009475708, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7323945164680481, + "num_tokens": 276527100.0, + "step": 10925 + }, + { + "epoch": 1.1998682187568637, + "grad_norm": 2.0465033054351807, + "learning_rate": 1e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7359511852264404, + "num_tokens": 276555870.0, + "step": 10926 + }, + { + "epoch": 1.1999780364594772, + "grad_norm": 2.1063971519470215, + "learning_rate": 1e-06, + "loss": 0.8036, + "mean_token_accuracy": 0.749589204788208, + "num_tokens": 276583793.0, + "step": 10927 + }, + { + "epoch": 1.200087854162091, + "grad_norm": 2.396188259124756, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7080049514770508, + "num_tokens": 276608503.0, + "step": 10928 + }, + { + "epoch": 1.2001976718647045, + "grad_norm": 2.0678322315216064, + "learning_rate": 1e-06, 
+ "loss": 0.9276, + "mean_token_accuracy": 0.7087361812591553, + "num_tokens": 276638632.0, + "step": 10929 + }, + { + "epoch": 1.2003074895673183, + "grad_norm": 2.206342935562134, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.6986174583435059, + "num_tokens": 276664817.0, + "step": 10930 + }, + { + "epoch": 1.200417307269932, + "grad_norm": 2.65494966506958, + "learning_rate": 1e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7360186576843262, + "num_tokens": 276683940.0, + "step": 10931 + }, + { + "epoch": 1.2005271249725455, + "grad_norm": 2.45499849319458, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7280945181846619, + "num_tokens": 276706060.0, + "step": 10932 + }, + { + "epoch": 1.2006369426751593, + "grad_norm": 2.3066582679748535, + "learning_rate": 1e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7355152368545532, + "num_tokens": 276730784.0, + "step": 10933 + }, + { + "epoch": 1.2007467603777728, + "grad_norm": 2.183366298675537, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7134432792663574, + "num_tokens": 276758051.0, + "step": 10934 + }, + { + "epoch": 1.2008565780803866, + "grad_norm": 2.577087640762329, + "learning_rate": 1e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.6966617107391357, + "num_tokens": 276784059.0, + "step": 10935 + }, + { + "epoch": 1.2009663957830001, + "grad_norm": 2.1490721702575684, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7045442461967468, + "num_tokens": 276813092.0, + "step": 10936 + }, + { + "epoch": 1.2010762134856139, + "grad_norm": 2.5325541496276855, + "learning_rate": 1e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7306281328201294, + "num_tokens": 276833840.0, + "step": 10937 + }, + { + "epoch": 1.2011860311882274, + "grad_norm": 2.3800039291381836, + "learning_rate": 1e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7179832458496094, + "num_tokens": 276857414.0, + "step": 10938 + }, + { + 
"epoch": 1.2012958488908412, + "grad_norm": 2.103438377380371, + "learning_rate": 1e-06, + "loss": 0.8432, + "mean_token_accuracy": 0.7330557107925415, + "num_tokens": 276884026.0, + "step": 10939 + }, + { + "epoch": 1.201405666593455, + "grad_norm": 2.082767963409424, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.6977152824401855, + "num_tokens": 276915325.0, + "step": 10940 + }, + { + "epoch": 1.2015154842960685, + "grad_norm": 2.4554383754730225, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7068712115287781, + "num_tokens": 276939150.0, + "step": 10941 + }, + { + "epoch": 1.2016253019986822, + "grad_norm": 2.0250391960144043, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7087075710296631, + "num_tokens": 276971489.0, + "step": 10942 + }, + { + "epoch": 1.2017351197012958, + "grad_norm": 2.462435722351074, + "learning_rate": 1e-06, + "loss": 0.8281, + "mean_token_accuracy": 0.737163782119751, + "num_tokens": 276992776.0, + "step": 10943 + }, + { + "epoch": 1.2018449374039095, + "grad_norm": 2.2236573696136475, + "learning_rate": 1e-06, + "loss": 0.8608, + "mean_token_accuracy": 0.7326605916023254, + "num_tokens": 277017654.0, + "step": 10944 + }, + { + "epoch": 1.2019547551065233, + "grad_norm": 2.6841881275177, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7216716408729553, + "num_tokens": 277037610.0, + "step": 10945 + }, + { + "epoch": 1.2020645728091368, + "grad_norm": 2.303650379180908, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7252810597419739, + "num_tokens": 277060499.0, + "step": 10946 + }, + { + "epoch": 1.2021743905117506, + "grad_norm": 2.2644388675689697, + "learning_rate": 1e-06, + "loss": 0.8294, + "mean_token_accuracy": 0.7402482032775879, + "num_tokens": 277084376.0, + "step": 10947 + }, + { + "epoch": 1.202284208214364, + "grad_norm": 2.575223207473755, + "learning_rate": 1e-06, + "loss": 0.871, + 
"mean_token_accuracy": 0.7234779000282288, + "num_tokens": 277106762.0, + "step": 10948 + }, + { + "epoch": 1.2023940259169779, + "grad_norm": 2.278581142425537, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7092505693435669, + "num_tokens": 277132360.0, + "step": 10949 + }, + { + "epoch": 1.2025038436195914, + "grad_norm": 2.2012245655059814, + "learning_rate": 1e-06, + "loss": 0.8571, + "mean_token_accuracy": 0.7280905246734619, + "num_tokens": 277157906.0, + "step": 10950 + }, + { + "epoch": 1.2026136613222052, + "grad_norm": 2.1108126640319824, + "learning_rate": 1e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.73375403881073, + "num_tokens": 277185711.0, + "step": 10951 + }, + { + "epoch": 1.2027234790248187, + "grad_norm": 2.0176968574523926, + "learning_rate": 1e-06, + "loss": 1.0335, + "mean_token_accuracy": 0.6835065484046936, + "num_tokens": 277217594.0, + "step": 10952 + }, + { + "epoch": 1.2028332967274324, + "grad_norm": 2.331584930419922, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7207399010658264, + "num_tokens": 277242629.0, + "step": 10953 + }, + { + "epoch": 1.2029431144300462, + "grad_norm": 2.3340506553649902, + "learning_rate": 1e-06, + "loss": 0.8334, + "mean_token_accuracy": 0.7413352727890015, + "num_tokens": 277266153.0, + "step": 10954 + }, + { + "epoch": 1.2030529321326597, + "grad_norm": 2.213926315307617, + "learning_rate": 1e-06, + "loss": 0.8551, + "mean_token_accuracy": 0.7369985580444336, + "num_tokens": 277292648.0, + "step": 10955 + }, + { + "epoch": 1.2031627498352735, + "grad_norm": 2.1987011432647705, + "learning_rate": 1e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7221274971961975, + "num_tokens": 277319558.0, + "step": 10956 + }, + { + "epoch": 1.203272567537887, + "grad_norm": 2.165116786956787, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.707349956035614, + "num_tokens": 277347372.0, + "step": 10957 + }, + { + "epoch": 
1.2033823852405008, + "grad_norm": 2.02840518951416, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.715516209602356, + "num_tokens": 277379625.0, + "step": 10958 + }, + { + "epoch": 1.2034922029431145, + "grad_norm": 2.313870668411255, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.7327170372009277, + "num_tokens": 277403157.0, + "step": 10959 + }, + { + "epoch": 1.203602020645728, + "grad_norm": 2.226651906967163, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7102669477462769, + "num_tokens": 277427671.0, + "step": 10960 + }, + { + "epoch": 1.2037118383483418, + "grad_norm": 2.2861931324005127, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7335357069969177, + "num_tokens": 277452530.0, + "step": 10961 + }, + { + "epoch": 1.2038216560509554, + "grad_norm": 2.1730880737304688, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7113705277442932, + "num_tokens": 277480542.0, + "step": 10962 + }, + { + "epoch": 1.2039314737535691, + "grad_norm": 2.448136329650879, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7150511741638184, + "num_tokens": 277502116.0, + "step": 10963 + }, + { + "epoch": 1.2040412914561827, + "grad_norm": 2.407585620880127, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7178763151168823, + "num_tokens": 277524188.0, + "step": 10964 + }, + { + "epoch": 1.2041511091587964, + "grad_norm": 2.357314348220825, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7179818749427795, + "num_tokens": 277547723.0, + "step": 10965 + }, + { + "epoch": 1.20426092686141, + "grad_norm": 2.7655317783355713, + "learning_rate": 1e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.7267177104949951, + "num_tokens": 277567236.0, + "step": 10966 + }, + { + "epoch": 1.2043707445640237, + "grad_norm": 2.4798390865325928, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 
0.7246910333633423, + "num_tokens": 277589293.0, + "step": 10967 + }, + { + "epoch": 1.2044805622666375, + "grad_norm": 2.4532687664031982, + "learning_rate": 1e-06, + "loss": 0.8677, + "mean_token_accuracy": 0.7294260263442993, + "num_tokens": 277611219.0, + "step": 10968 + }, + { + "epoch": 1.204590379969251, + "grad_norm": 2.1968283653259277, + "learning_rate": 1e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.718489408493042, + "num_tokens": 277638158.0, + "step": 10969 + }, + { + "epoch": 1.2047001976718648, + "grad_norm": 2.2930221557617188, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7050566077232361, + "num_tokens": 277663113.0, + "step": 10970 + }, + { + "epoch": 1.2048100153744783, + "grad_norm": 2.3784868717193604, + "learning_rate": 1e-06, + "loss": 0.8227, + "mean_token_accuracy": 0.7332367897033691, + "num_tokens": 277686890.0, + "step": 10971 + }, + { + "epoch": 1.204919833077092, + "grad_norm": 2.3582711219787598, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7241463661193848, + "num_tokens": 277711230.0, + "step": 10972 + }, + { + "epoch": 1.2050296507797058, + "grad_norm": 2.1312415599823, + "learning_rate": 1e-06, + "loss": 0.8381, + "mean_token_accuracy": 0.7333617210388184, + "num_tokens": 277738746.0, + "step": 10973 + }, + { + "epoch": 1.2051394684823193, + "grad_norm": 2.6510872840881348, + "learning_rate": 1e-06, + "loss": 0.7241, + "mean_token_accuracy": 0.7673121094703674, + "num_tokens": 277757530.0, + "step": 10974 + }, + { + "epoch": 1.205249286184933, + "grad_norm": 2.5540990829467773, + "learning_rate": 1e-06, + "loss": 0.8258, + "mean_token_accuracy": 0.7360886335372925, + "num_tokens": 277776179.0, + "step": 10975 + }, + { + "epoch": 1.2053591038875466, + "grad_norm": 2.2067151069641113, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7266276478767395, + "num_tokens": 277801812.0, + "step": 10976 + }, + { + "epoch": 1.2054689215901604, + "grad_norm": 
2.280115842819214, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7078844904899597, + "num_tokens": 277829107.0, + "step": 10977 + }, + { + "epoch": 1.205578739292774, + "grad_norm": 2.134727716445923, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7006316184997559, + "num_tokens": 277856081.0, + "step": 10978 + }, + { + "epoch": 1.2056885569953877, + "grad_norm": 2.0589799880981445, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7068423628807068, + "num_tokens": 277886946.0, + "step": 10979 + }, + { + "epoch": 1.2057983746980012, + "grad_norm": 2.1760361194610596, + "learning_rate": 1e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.7320145964622498, + "num_tokens": 277913927.0, + "step": 10980 + }, + { + "epoch": 1.205908192400615, + "grad_norm": 1.9784328937530518, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.731223464012146, + "num_tokens": 277943084.0, + "step": 10981 + }, + { + "epoch": 1.2060180101032287, + "grad_norm": 2.180197238922119, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7115359306335449, + "num_tokens": 277969771.0, + "step": 10982 + }, + { + "epoch": 1.2061278278058423, + "grad_norm": 2.2324540615081787, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7170765399932861, + "num_tokens": 277995721.0, + "step": 10983 + }, + { + "epoch": 1.206237645508456, + "grad_norm": 2.0895683765411377, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7106953859329224, + "num_tokens": 278023278.0, + "step": 10984 + }, + { + "epoch": 1.2063474632110696, + "grad_norm": 2.386784791946411, + "learning_rate": 1e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7221416234970093, + "num_tokens": 278046540.0, + "step": 10985 + }, + { + "epoch": 1.2064572809136833, + "grad_norm": 2.3020033836364746, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7154076099395752, + "num_tokens": 
278070903.0, + "step": 10986 + }, + { + "epoch": 1.2065670986162969, + "grad_norm": 2.3215041160583496, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7252328395843506, + "num_tokens": 278095115.0, + "step": 10987 + }, + { + "epoch": 1.2066769163189106, + "grad_norm": 2.085909843444824, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7335623502731323, + "num_tokens": 278121963.0, + "step": 10988 + }, + { + "epoch": 1.2067867340215244, + "grad_norm": 2.4320788383483887, + "learning_rate": 1e-06, + "loss": 0.8385, + "mean_token_accuracy": 0.7350819110870361, + "num_tokens": 278142853.0, + "step": 10989 + }, + { + "epoch": 1.206896551724138, + "grad_norm": 2.4932172298431396, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7195810675621033, + "num_tokens": 278165111.0, + "step": 10990 + }, + { + "epoch": 1.2070063694267517, + "grad_norm": 2.4471490383148193, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7175592184066772, + "num_tokens": 278187393.0, + "step": 10991 + }, + { + "epoch": 1.2071161871293652, + "grad_norm": 2.302368640899658, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7050776481628418, + "num_tokens": 278211109.0, + "step": 10992 + }, + { + "epoch": 1.207226004831979, + "grad_norm": 2.414736747741699, + "learning_rate": 1e-06, + "loss": 0.8646, + "mean_token_accuracy": 0.7293998599052429, + "num_tokens": 278233276.0, + "step": 10993 + }, + { + "epoch": 1.2073358225345925, + "grad_norm": 2.2130610942840576, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7208666801452637, + "num_tokens": 278259070.0, + "step": 10994 + }, + { + "epoch": 1.2074456402372062, + "grad_norm": 2.447800397872925, + "learning_rate": 1e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7298762798309326, + "num_tokens": 278282916.0, + "step": 10995 + }, + { + "epoch": 1.20755545793982, + "grad_norm": 2.5593340396881104, + "learning_rate": 
1e-06, + "loss": 0.8362, + "mean_token_accuracy": 0.7406466007232666, + "num_tokens": 278303964.0, + "step": 10996 + }, + { + "epoch": 1.2076652756424335, + "grad_norm": 2.325942277908325, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.7085131406784058, + "num_tokens": 278329202.0, + "step": 10997 + }, + { + "epoch": 1.2077750933450473, + "grad_norm": 2.2136898040771484, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7211270332336426, + "num_tokens": 278355683.0, + "step": 10998 + }, + { + "epoch": 1.2078849110476608, + "grad_norm": 2.6215262413024902, + "learning_rate": 1e-06, + "loss": 0.8326, + "mean_token_accuracy": 0.7355257868766785, + "num_tokens": 278375963.0, + "step": 10999 + }, + { + "epoch": 1.2079947287502746, + "grad_norm": 2.077486753463745, + "learning_rate": 1e-06, + "loss": 0.7933, + "mean_token_accuracy": 0.7501318454742432, + "num_tokens": 278401353.0, + "step": 11000 + }, + { + "epoch": 1.2081045464528881, + "grad_norm": 2.2586357593536377, + "learning_rate": 1e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7233564853668213, + "num_tokens": 278425923.0, + "step": 11001 + }, + { + "epoch": 1.2082143641555019, + "grad_norm": 2.3109066486358643, + "learning_rate": 1e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.7249021530151367, + "num_tokens": 278449365.0, + "step": 11002 + }, + { + "epoch": 1.2083241818581154, + "grad_norm": 2.1851093769073486, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7209852337837219, + "num_tokens": 278477309.0, + "step": 11003 + }, + { + "epoch": 1.2084339995607292, + "grad_norm": 2.3199267387390137, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.6949796676635742, + "num_tokens": 278502543.0, + "step": 11004 + }, + { + "epoch": 1.208543817263343, + "grad_norm": 2.3821983337402344, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7194366455078125, + "num_tokens": 278527586.0, + "step": 11005 + }, + 
{ + "epoch": 1.2086536349659565, + "grad_norm": 2.223851442337036, + "learning_rate": 1e-06, + "loss": 0.8398, + "mean_token_accuracy": 0.7301597595214844, + "num_tokens": 278552653.0, + "step": 11006 + }, + { + "epoch": 1.2087634526685702, + "grad_norm": 2.004002809524536, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7087317705154419, + "num_tokens": 278582469.0, + "step": 11007 + }, + { + "epoch": 1.2088732703711838, + "grad_norm": 2.2391233444213867, + "learning_rate": 1e-06, + "loss": 0.8201, + "mean_token_accuracy": 0.7425004243850708, + "num_tokens": 278606689.0, + "step": 11008 + }, + { + "epoch": 1.2089830880737975, + "grad_norm": 2.187274217605591, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7145910263061523, + "num_tokens": 278634686.0, + "step": 11009 + }, + { + "epoch": 1.2090929057764113, + "grad_norm": 2.4425594806671143, + "learning_rate": 1e-06, + "loss": 0.8639, + "mean_token_accuracy": 0.7311915159225464, + "num_tokens": 278659149.0, + "step": 11010 + }, + { + "epoch": 1.2092027234790248, + "grad_norm": 2.1128835678100586, + "learning_rate": 1e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7369064092636108, + "num_tokens": 278685421.0, + "step": 11011 + }, + { + "epoch": 1.2093125411816386, + "grad_norm": 2.197601318359375, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7238391637802124, + "num_tokens": 278710110.0, + "step": 11012 + }, + { + "epoch": 1.209422358884252, + "grad_norm": 2.1659233570098877, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7184556722640991, + "num_tokens": 278736439.0, + "step": 11013 + }, + { + "epoch": 1.2095321765868658, + "grad_norm": 2.0958895683288574, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7105891704559326, + "num_tokens": 278764842.0, + "step": 11014 + }, + { + "epoch": 1.2096419942894794, + "grad_norm": 2.2101263999938965, + "learning_rate": 1e-06, + "loss": 0.9615, + 
"mean_token_accuracy": 0.7034409046173096, + "num_tokens": 278790665.0, + "step": 11015 + }, + { + "epoch": 1.2097518119920931, + "grad_norm": 2.1759722232818604, + "learning_rate": 1e-06, + "loss": 0.8297, + "mean_token_accuracy": 0.7295093536376953, + "num_tokens": 278816086.0, + "step": 11016 + }, + { + "epoch": 1.2098616296947067, + "grad_norm": 2.3626065254211426, + "learning_rate": 1e-06, + "loss": 0.7822, + "mean_token_accuracy": 0.7593476176261902, + "num_tokens": 278838301.0, + "step": 11017 + }, + { + "epoch": 1.2099714473973204, + "grad_norm": 2.4079954624176025, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7112730145454407, + "num_tokens": 278860238.0, + "step": 11018 + }, + { + "epoch": 1.2100812650999342, + "grad_norm": 1.954203724861145, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7123483419418335, + "num_tokens": 278892062.0, + "step": 11019 + }, + { + "epoch": 1.2101910828025477, + "grad_norm": 2.34269118309021, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7205905914306641, + "num_tokens": 278917607.0, + "step": 11020 + }, + { + "epoch": 1.2103009005051615, + "grad_norm": 2.23578143119812, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.6914147138595581, + "num_tokens": 278945398.0, + "step": 11021 + }, + { + "epoch": 1.210410718207775, + "grad_norm": 2.176657199859619, + "learning_rate": 1e-06, + "loss": 0.8139, + "mean_token_accuracy": 0.7438190579414368, + "num_tokens": 278971282.0, + "step": 11022 + }, + { + "epoch": 1.2105205359103888, + "grad_norm": 2.0836739540100098, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.697601854801178, + "num_tokens": 278999669.0, + "step": 11023 + }, + { + "epoch": 1.2106303536130025, + "grad_norm": 2.4372665882110596, + "learning_rate": 1e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7211418151855469, + "num_tokens": 279021151.0, + "step": 11024 + }, + { + "epoch": 1.210740171315616, 
+ "grad_norm": 2.257370948791504, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7261255979537964, + "num_tokens": 279045429.0, + "step": 11025 + }, + { + "epoch": 1.2108499890182298, + "grad_norm": 2.290174722671509, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7172989845275879, + "num_tokens": 279070136.0, + "step": 11026 + }, + { + "epoch": 1.2109598067208434, + "grad_norm": 2.0914835929870605, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7099204659461975, + "num_tokens": 279098937.0, + "step": 11027 + }, + { + "epoch": 1.2110696244234571, + "grad_norm": 2.4014625549316406, + "learning_rate": 1e-06, + "loss": 0.7806, + "mean_token_accuracy": 0.7480176687240601, + "num_tokens": 279122426.0, + "step": 11028 + }, + { + "epoch": 1.2111794421260706, + "grad_norm": 2.489947557449341, + "learning_rate": 1e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.7319318056106567, + "num_tokens": 279142275.0, + "step": 11029 + }, + { + "epoch": 1.2112892598286844, + "grad_norm": 2.4681785106658936, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7188555002212524, + "num_tokens": 279165434.0, + "step": 11030 + }, + { + "epoch": 1.211399077531298, + "grad_norm": 2.011054277420044, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.6953498721122742, + "num_tokens": 279196917.0, + "step": 11031 + }, + { + "epoch": 1.2115088952339117, + "grad_norm": 2.359281301498413, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.714393675327301, + "num_tokens": 279221183.0, + "step": 11032 + }, + { + "epoch": 1.2116187129365255, + "grad_norm": 2.3024942874908447, + "learning_rate": 1e-06, + "loss": 0.8211, + "mean_token_accuracy": 0.7415416836738586, + "num_tokens": 279244652.0, + "step": 11033 + }, + { + "epoch": 1.211728530639139, + "grad_norm": 1.9745968580245972, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7021138668060303, + 
"num_tokens": 279278152.0, + "step": 11034 + }, + { + "epoch": 1.2118383483417527, + "grad_norm": 2.171438694000244, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7023118734359741, + "num_tokens": 279305736.0, + "step": 11035 + }, + { + "epoch": 1.2119481660443663, + "grad_norm": 2.6255130767822266, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7147517204284668, + "num_tokens": 279326897.0, + "step": 11036 + }, + { + "epoch": 1.21205798374698, + "grad_norm": 2.164252758026123, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7146701812744141, + "num_tokens": 279354313.0, + "step": 11037 + }, + { + "epoch": 1.2121678014495938, + "grad_norm": 2.4406001567840576, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7189401388168335, + "num_tokens": 279377200.0, + "step": 11038 + }, + { + "epoch": 1.2122776191522073, + "grad_norm": 2.6953718662261963, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7285877466201782, + "num_tokens": 279397266.0, + "step": 11039 + }, + { + "epoch": 1.212387436854821, + "grad_norm": 2.2342941761016846, + "learning_rate": 1e-06, + "loss": 0.8643, + "mean_token_accuracy": 0.7289386987686157, + "num_tokens": 279423063.0, + "step": 11040 + }, + { + "epoch": 1.2124972545574346, + "grad_norm": 2.3928518295288086, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7224347591400146, + "num_tokens": 279445388.0, + "step": 11041 + }, + { + "epoch": 1.2126070722600484, + "grad_norm": 2.222435712814331, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7065107226371765, + "num_tokens": 279472095.0, + "step": 11042 + }, + { + "epoch": 1.212716889962662, + "grad_norm": 1.9617036581039429, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7303171157836914, + "num_tokens": 279503614.0, + "step": 11043 + }, + { + "epoch": 1.2128267076652757, + "grad_norm": 2.4602861404418945, + 
"learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7093913555145264, + "num_tokens": 279527130.0, + "step": 11044 + }, + { + "epoch": 1.2129365253678892, + "grad_norm": 2.4269299507141113, + "learning_rate": 1e-06, + "loss": 0.8265, + "mean_token_accuracy": 0.738259494304657, + "num_tokens": 279547509.0, + "step": 11045 + }, + { + "epoch": 1.213046343070503, + "grad_norm": 2.302769184112549, + "learning_rate": 1e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.7339453101158142, + "num_tokens": 279569592.0, + "step": 11046 + }, + { + "epoch": 1.2131561607731167, + "grad_norm": 2.0360500812530518, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7075033187866211, + "num_tokens": 279599037.0, + "step": 11047 + }, + { + "epoch": 1.2132659784757303, + "grad_norm": 2.180981159210205, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.701896607875824, + "num_tokens": 279625738.0, + "step": 11048 + }, + { + "epoch": 1.213375796178344, + "grad_norm": 2.2803375720977783, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.7287116646766663, + "num_tokens": 279650182.0, + "step": 11049 + }, + { + "epoch": 1.2134856138809575, + "grad_norm": 3.022768974304199, + "learning_rate": 1e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7272803783416748, + "num_tokens": 279665282.0, + "step": 11050 + }, + { + "epoch": 1.2135954315835713, + "grad_norm": 2.427577495574951, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.6960535645484924, + "num_tokens": 279689053.0, + "step": 11051 + }, + { + "epoch": 1.2137052492861848, + "grad_norm": 2.328367233276367, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7164226174354553, + "num_tokens": 279714292.0, + "step": 11052 + }, + { + "epoch": 1.2138150669887986, + "grad_norm": 2.229320764541626, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.6874548196792603, + "num_tokens": 279742720.0, + "step": 
11053 + }, + { + "epoch": 1.2139248846914121, + "grad_norm": 1.99029541015625, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7270655632019043, + "num_tokens": 279772796.0, + "step": 11054 + }, + { + "epoch": 1.2140347023940259, + "grad_norm": 2.0709097385406494, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.712448000907898, + "num_tokens": 279802263.0, + "step": 11055 + }, + { + "epoch": 1.2141445200966396, + "grad_norm": 2.372009754180908, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7199983596801758, + "num_tokens": 279824936.0, + "step": 11056 + }, + { + "epoch": 1.2142543377992532, + "grad_norm": 2.3219833374023438, + "learning_rate": 1e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.7280411720275879, + "num_tokens": 279849454.0, + "step": 11057 + }, + { + "epoch": 1.214364155501867, + "grad_norm": 2.3511741161346436, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7037420868873596, + "num_tokens": 279873059.0, + "step": 11058 + }, + { + "epoch": 1.2144739732044805, + "grad_norm": 2.3768956661224365, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7093456983566284, + "num_tokens": 279896698.0, + "step": 11059 + }, + { + "epoch": 1.2145837909070942, + "grad_norm": 2.1515450477600098, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.6924260854721069, + "num_tokens": 279925349.0, + "step": 11060 + }, + { + "epoch": 1.214693608609708, + "grad_norm": 2.456188440322876, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7324486970901489, + "num_tokens": 279948464.0, + "step": 11061 + }, + { + "epoch": 1.2148034263123215, + "grad_norm": 2.3193435668945312, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7155584096908569, + "num_tokens": 279972437.0, + "step": 11062 + }, + { + "epoch": 1.2149132440149353, + "grad_norm": 2.3343148231506348, + "learning_rate": 1e-06, + "loss": 0.8739, 
+ "mean_token_accuracy": 0.7275491952896118, + "num_tokens": 279996101.0, + "step": 11063 + }, + { + "epoch": 1.2150230617175488, + "grad_norm": 2.555018186569214, + "learning_rate": 1e-06, + "loss": 0.8589, + "mean_token_accuracy": 0.7429872155189514, + "num_tokens": 280017112.0, + "step": 11064 + }, + { + "epoch": 1.2151328794201626, + "grad_norm": 2.179129123687744, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7285082340240479, + "num_tokens": 280043644.0, + "step": 11065 + }, + { + "epoch": 1.215242697122776, + "grad_norm": 2.413045644760132, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7088866233825684, + "num_tokens": 280067168.0, + "step": 11066 + }, + { + "epoch": 1.2153525148253899, + "grad_norm": 2.4759578704833984, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7117753624916077, + "num_tokens": 280089158.0, + "step": 11067 + }, + { + "epoch": 1.2154623325280034, + "grad_norm": 2.464982748031616, + "learning_rate": 1e-06, + "loss": 0.8276, + "mean_token_accuracy": 0.7359206676483154, + "num_tokens": 280111981.0, + "step": 11068 + }, + { + "epoch": 1.2155721502306172, + "grad_norm": 2.236194372177124, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.7060914635658264, + "num_tokens": 280139430.0, + "step": 11069 + }, + { + "epoch": 1.215681967933231, + "grad_norm": 2.703939437866211, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7284317016601562, + "num_tokens": 280157517.0, + "step": 11070 + }, + { + "epoch": 1.2157917856358444, + "grad_norm": 2.1699814796447754, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7074868083000183, + "num_tokens": 280185515.0, + "step": 11071 + }, + { + "epoch": 1.2159016033384582, + "grad_norm": 2.0605247020721436, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7151155471801758, + "num_tokens": 280214294.0, + "step": 11072 + }, + { + "epoch": 
1.2160114210410717, + "grad_norm": 2.4847004413604736, + "learning_rate": 1e-06, + "loss": 0.8321, + "mean_token_accuracy": 0.7363067269325256, + "num_tokens": 280235254.0, + "step": 11073 + }, + { + "epoch": 1.2161212387436855, + "grad_norm": 2.206874132156372, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.716553807258606, + "num_tokens": 280262253.0, + "step": 11074 + }, + { + "epoch": 1.2162310564462993, + "grad_norm": 2.5806360244750977, + "learning_rate": 1e-06, + "loss": 0.8039, + "mean_token_accuracy": 0.7419605255126953, + "num_tokens": 280284208.0, + "step": 11075 + }, + { + "epoch": 1.2163408741489128, + "grad_norm": 2.3499643802642822, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7192643880844116, + "num_tokens": 280308272.0, + "step": 11076 + }, + { + "epoch": 1.2164506918515265, + "grad_norm": 2.4932456016540527, + "learning_rate": 1e-06, + "loss": 0.8095, + "mean_token_accuracy": 0.7407435178756714, + "num_tokens": 280329249.0, + "step": 11077 + }, + { + "epoch": 1.21656050955414, + "grad_norm": 2.1359827518463135, + "learning_rate": 1e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7325356006622314, + "num_tokens": 280355738.0, + "step": 11078 + }, + { + "epoch": 1.2166703272567538, + "grad_norm": 2.170250654220581, + "learning_rate": 1e-06, + "loss": 0.835, + "mean_token_accuracy": 0.735543429851532, + "num_tokens": 280381382.0, + "step": 11079 + }, + { + "epoch": 1.2167801449593674, + "grad_norm": 2.8049707412719727, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.720386266708374, + "num_tokens": 280401206.0, + "step": 11080 + }, + { + "epoch": 1.2168899626619811, + "grad_norm": 2.3920748233795166, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7250423431396484, + "num_tokens": 280426874.0, + "step": 11081 + }, + { + "epoch": 1.2169997803645947, + "grad_norm": 2.5699973106384277, + "learning_rate": 1e-06, + "loss": 0.8146, + "mean_token_accuracy": 
0.7425031661987305, + "num_tokens": 280447106.0, + "step": 11082 + }, + { + "epoch": 1.2171095980672084, + "grad_norm": 2.431785821914673, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7105575799942017, + "num_tokens": 280471519.0, + "step": 11083 + }, + { + "epoch": 1.2172194157698222, + "grad_norm": 2.2178256511688232, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7110339403152466, + "num_tokens": 280498125.0, + "step": 11084 + }, + { + "epoch": 1.2173292334724357, + "grad_norm": 2.3163952827453613, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7081023454666138, + "num_tokens": 280523336.0, + "step": 11085 + }, + { + "epoch": 1.2174390511750495, + "grad_norm": 2.4207382202148438, + "learning_rate": 1e-06, + "loss": 0.8646, + "mean_token_accuracy": 0.723034143447876, + "num_tokens": 280545713.0, + "step": 11086 + }, + { + "epoch": 1.217548868877663, + "grad_norm": 2.4446895122528076, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7212697863578796, + "num_tokens": 280568297.0, + "step": 11087 + }, + { + "epoch": 1.2176586865802768, + "grad_norm": 2.3263888359069824, + "learning_rate": 1e-06, + "loss": 0.8344, + "mean_token_accuracy": 0.7392358779907227, + "num_tokens": 280593586.0, + "step": 11088 + }, + { + "epoch": 1.2177685042828905, + "grad_norm": 2.5043890476226807, + "learning_rate": 1e-06, + "loss": 0.8464, + "mean_token_accuracy": 0.7342950105667114, + "num_tokens": 280613908.0, + "step": 11089 + }, + { + "epoch": 1.217878321985504, + "grad_norm": 2.4530153274536133, + "learning_rate": 1e-06, + "loss": 0.8531, + "mean_token_accuracy": 0.7327930927276611, + "num_tokens": 280638116.0, + "step": 11090 + }, + { + "epoch": 1.2179881396881178, + "grad_norm": 2.4995977878570557, + "learning_rate": 1e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7218798398971558, + "num_tokens": 280658717.0, + "step": 11091 + }, + { + "epoch": 1.2180979573907313, + "grad_norm": 
2.3680288791656494, + "learning_rate": 1e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.7332244515419006, + "num_tokens": 280680918.0, + "step": 11092 + }, + { + "epoch": 1.218207775093345, + "grad_norm": 2.1986753940582275, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7115719318389893, + "num_tokens": 280707345.0, + "step": 11093 + }, + { + "epoch": 1.2183175927959586, + "grad_norm": 1.8740715980529785, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7033330202102661, + "num_tokens": 280741735.0, + "step": 11094 + }, + { + "epoch": 1.2184274104985724, + "grad_norm": 2.565892457962036, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7284810543060303, + "num_tokens": 280761959.0, + "step": 11095 + }, + { + "epoch": 1.218537228201186, + "grad_norm": 2.444007635116577, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7097797393798828, + "num_tokens": 280785328.0, + "step": 11096 + }, + { + "epoch": 1.2186470459037997, + "grad_norm": 2.435211658477783, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7083209753036499, + "num_tokens": 280809362.0, + "step": 11097 + }, + { + "epoch": 1.2187568636064134, + "grad_norm": 2.4313738346099854, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7155374884605408, + "num_tokens": 280835110.0, + "step": 11098 + }, + { + "epoch": 1.218866681309027, + "grad_norm": 2.510143995285034, + "learning_rate": 1e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.7338396310806274, + "num_tokens": 280855432.0, + "step": 11099 + }, + { + "epoch": 1.2189764990116407, + "grad_norm": 2.517341136932373, + "learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.7325751185417175, + "num_tokens": 280877614.0, + "step": 11100 + }, + { + "epoch": 1.2190863167142543, + "grad_norm": 2.4259719848632812, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7110166549682617, + "num_tokens": 
280900916.0, + "step": 11101 + }, + { + "epoch": 1.219196134416868, + "grad_norm": 2.3097779750823975, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7246203422546387, + "num_tokens": 280926492.0, + "step": 11102 + }, + { + "epoch": 1.2193059521194818, + "grad_norm": 2.325791597366333, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7002359628677368, + "num_tokens": 280952795.0, + "step": 11103 + }, + { + "epoch": 1.2194157698220953, + "grad_norm": 2.254143238067627, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7178668975830078, + "num_tokens": 280977111.0, + "step": 11104 + }, + { + "epoch": 1.219525587524709, + "grad_norm": 2.3037502765655518, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.6912700533866882, + "num_tokens": 281001339.0, + "step": 11105 + }, + { + "epoch": 1.2196354052273226, + "grad_norm": 2.4554800987243652, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7019243240356445, + "num_tokens": 281023763.0, + "step": 11106 + }, + { + "epoch": 1.2197452229299364, + "grad_norm": 2.037236452102661, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7183351516723633, + "num_tokens": 281052846.0, + "step": 11107 + }, + { + "epoch": 1.21985504063255, + "grad_norm": 2.433884382247925, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7105543613433838, + "num_tokens": 281076561.0, + "step": 11108 + }, + { + "epoch": 1.2199648583351637, + "grad_norm": 2.1338894367218018, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7094665169715881, + "num_tokens": 281107024.0, + "step": 11109 + }, + { + "epoch": 1.2200746760377772, + "grad_norm": 2.5950303077697754, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7231361865997314, + "num_tokens": 281127028.0, + "step": 11110 + }, + { + "epoch": 1.220184493740391, + "grad_norm": 2.4312968254089355, + "learning_rate": 
1e-06, + "loss": 0.8709, + "mean_token_accuracy": 0.7250756025314331, + "num_tokens": 281149256.0, + "step": 11111 + }, + { + "epoch": 1.2202943114430047, + "grad_norm": 2.6065833568573, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7037124633789062, + "num_tokens": 281170971.0, + "step": 11112 + }, + { + "epoch": 1.2204041291456182, + "grad_norm": 2.241668701171875, + "learning_rate": 1e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.7351304292678833, + "num_tokens": 281195923.0, + "step": 11113 + }, + { + "epoch": 1.220513946848232, + "grad_norm": 2.2114405632019043, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7049247622489929, + "num_tokens": 281224420.0, + "step": 11114 + }, + { + "epoch": 1.2206237645508455, + "grad_norm": 2.1836533546447754, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.715688169002533, + "num_tokens": 281251438.0, + "step": 11115 + }, + { + "epoch": 1.2207335822534593, + "grad_norm": 2.5895678997039795, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.719467282295227, + "num_tokens": 281271101.0, + "step": 11116 + }, + { + "epoch": 1.2208433999560728, + "grad_norm": 2.2375125885009766, + "learning_rate": 1e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.7276147603988647, + "num_tokens": 281297189.0, + "step": 11117 + }, + { + "epoch": 1.2209532176586866, + "grad_norm": 2.627451181411743, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7213606834411621, + "num_tokens": 281318730.0, + "step": 11118 + }, + { + "epoch": 1.2210630353613001, + "grad_norm": 2.3758459091186523, + "learning_rate": 1e-06, + "loss": 0.8376, + "mean_token_accuracy": 0.7346146106719971, + "num_tokens": 281341935.0, + "step": 11119 + }, + { + "epoch": 1.2211728530639139, + "grad_norm": 2.4480490684509277, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7072927355766296, + "num_tokens": 281364743.0, + "step": 11120 + }, + { + 
"epoch": 1.2212826707665276, + "grad_norm": 2.057532548904419, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7216102480888367, + "num_tokens": 281395376.0, + "step": 11121 + }, + { + "epoch": 1.2213924884691412, + "grad_norm": 2.219484567642212, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.7024914622306824, + "num_tokens": 281422555.0, + "step": 11122 + }, + { + "epoch": 1.221502306171755, + "grad_norm": 2.220412492752075, + "learning_rate": 1e-06, + "loss": 0.8412, + "mean_token_accuracy": 0.7332603931427002, + "num_tokens": 281448401.0, + "step": 11123 + }, + { + "epoch": 1.2216121238743685, + "grad_norm": 2.255782127380371, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.710496723651886, + "num_tokens": 281474586.0, + "step": 11124 + }, + { + "epoch": 1.2217219415769822, + "grad_norm": 2.368138551712036, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.6995914578437805, + "num_tokens": 281498909.0, + "step": 11125 + }, + { + "epoch": 1.221831759279596, + "grad_norm": 2.1061654090881348, + "learning_rate": 1e-06, + "loss": 0.8322, + "mean_token_accuracy": 0.7486546635627747, + "num_tokens": 281527102.0, + "step": 11126 + }, + { + "epoch": 1.2219415769822095, + "grad_norm": 2.208918571472168, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.6985621452331543, + "num_tokens": 281555897.0, + "step": 11127 + }, + { + "epoch": 1.2220513946848233, + "grad_norm": 2.3406662940979004, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7145932912826538, + "num_tokens": 281580537.0, + "step": 11128 + }, + { + "epoch": 1.2221612123874368, + "grad_norm": 2.1398611068725586, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7084078788757324, + "num_tokens": 281609545.0, + "step": 11129 + }, + { + "epoch": 1.2222710300900506, + "grad_norm": 2.5864787101745605, + "learning_rate": 1e-06, + "loss": 0.9298, + 
"mean_token_accuracy": 0.714065670967102, + "num_tokens": 281631070.0, + "step": 11130 + }, + { + "epoch": 1.222380847792664, + "grad_norm": 2.4977338314056396, + "learning_rate": 1e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.7268931865692139, + "num_tokens": 281652921.0, + "step": 11131 + }, + { + "epoch": 1.2224906654952779, + "grad_norm": 2.260819435119629, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.6906394362449646, + "num_tokens": 281679791.0, + "step": 11132 + }, + { + "epoch": 1.2226004831978914, + "grad_norm": 2.217139959335327, + "learning_rate": 1e-06, + "loss": 0.8287, + "mean_token_accuracy": 0.7348670959472656, + "num_tokens": 281704877.0, + "step": 11133 + }, + { + "epoch": 1.2227103009005051, + "grad_norm": 2.3700191974639893, + "learning_rate": 1e-06, + "loss": 0.8249, + "mean_token_accuracy": 0.7364879846572876, + "num_tokens": 281727829.0, + "step": 11134 + }, + { + "epoch": 1.222820118603119, + "grad_norm": 2.3860905170440674, + "learning_rate": 1e-06, + "loss": 0.8703, + "mean_token_accuracy": 0.7313895225524902, + "num_tokens": 281751320.0, + "step": 11135 + }, + { + "epoch": 1.2229299363057324, + "grad_norm": 2.157824993133545, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7038086652755737, + "num_tokens": 281780186.0, + "step": 11136 + }, + { + "epoch": 1.2230397540083462, + "grad_norm": 1.8531625270843506, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7202041149139404, + "num_tokens": 281813532.0, + "step": 11137 + }, + { + "epoch": 1.2231495717109597, + "grad_norm": 1.9058653116226196, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.6892918944358826, + "num_tokens": 281848154.0, + "step": 11138 + }, + { + "epoch": 1.2232593894135735, + "grad_norm": 2.33030366897583, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7023348212242126, + "num_tokens": 281871547.0, + "step": 11139 + }, + { + "epoch": 
1.2233692071161872, + "grad_norm": 2.203991651535034, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7001828551292419, + "num_tokens": 281897873.0, + "step": 11140 + }, + { + "epoch": 1.2234790248188008, + "grad_norm": 2.3442773818969727, + "learning_rate": 1e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.724395751953125, + "num_tokens": 281920326.0, + "step": 11141 + }, + { + "epoch": 1.2235888425214145, + "grad_norm": 2.1671619415283203, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7030672430992126, + "num_tokens": 281948132.0, + "step": 11142 + }, + { + "epoch": 1.223698660224028, + "grad_norm": 2.3805952072143555, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7149109244346619, + "num_tokens": 281970861.0, + "step": 11143 + }, + { + "epoch": 1.2238084779266418, + "grad_norm": 2.248208999633789, + "learning_rate": 1e-06, + "loss": 0.8198, + "mean_token_accuracy": 0.7351975440979004, + "num_tokens": 281997478.0, + "step": 11144 + }, + { + "epoch": 1.2239182956292554, + "grad_norm": 2.2972350120544434, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7176766395568848, + "num_tokens": 282023159.0, + "step": 11145 + }, + { + "epoch": 1.2240281133318691, + "grad_norm": 2.5552732944488525, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7239173054695129, + "num_tokens": 282046286.0, + "step": 11146 + }, + { + "epoch": 1.2241379310344827, + "grad_norm": 2.038203716278076, + "learning_rate": 1e-06, + "loss": 0.8173, + "mean_token_accuracy": 0.7390986680984497, + "num_tokens": 282073284.0, + "step": 11147 + }, + { + "epoch": 1.2242477487370964, + "grad_norm": 2.2769970893859863, + "learning_rate": 1e-06, + "loss": 0.8586, + "mean_token_accuracy": 0.7459433674812317, + "num_tokens": 282097044.0, + "step": 11148 + }, + { + "epoch": 1.2243575664397102, + "grad_norm": 2.419611692428589, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 
0.7157399654388428, + "num_tokens": 282121868.0, + "step": 11149 + }, + { + "epoch": 1.2244673841423237, + "grad_norm": 2.225955009460449, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7201574444770813, + "num_tokens": 282147649.0, + "step": 11150 + }, + { + "epoch": 1.2245772018449375, + "grad_norm": 2.44828724861145, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7071226835250854, + "num_tokens": 282171959.0, + "step": 11151 + }, + { + "epoch": 1.224687019547551, + "grad_norm": 1.9946399927139282, + "learning_rate": 1e-06, + "loss": 0.8785, + "mean_token_accuracy": 0.7206176519393921, + "num_tokens": 282203151.0, + "step": 11152 + }, + { + "epoch": 1.2247968372501647, + "grad_norm": 2.278914213180542, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7265044450759888, + "num_tokens": 282227078.0, + "step": 11153 + }, + { + "epoch": 1.2249066549527785, + "grad_norm": 2.1830010414123535, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7229475975036621, + "num_tokens": 282253460.0, + "step": 11154 + }, + { + "epoch": 1.225016472655392, + "grad_norm": 2.085587978363037, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7166997194290161, + "num_tokens": 282282498.0, + "step": 11155 + }, + { + "epoch": 1.2251262903580058, + "grad_norm": 2.3736863136291504, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7109994888305664, + "num_tokens": 282304972.0, + "step": 11156 + }, + { + "epoch": 1.2252361080606193, + "grad_norm": 2.2902493476867676, + "learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.6984866857528687, + "num_tokens": 282330374.0, + "step": 11157 + }, + { + "epoch": 1.225345925763233, + "grad_norm": 2.254103899002075, + "learning_rate": 1e-06, + "loss": 0.896, + "mean_token_accuracy": 0.7177096009254456, + "num_tokens": 282356844.0, + "step": 11158 + }, + { + "epoch": 1.2254557434658466, + "grad_norm": 
2.190255880355835, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.710260272026062, + "num_tokens": 282382324.0, + "step": 11159 + }, + { + "epoch": 1.2255655611684604, + "grad_norm": 2.390824794769287, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7343164682388306, + "num_tokens": 282405588.0, + "step": 11160 + }, + { + "epoch": 1.225675378871074, + "grad_norm": 2.2738566398620605, + "learning_rate": 1e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.7388237118721008, + "num_tokens": 282428139.0, + "step": 11161 + }, + { + "epoch": 1.2257851965736877, + "grad_norm": 2.2118945121765137, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7166666984558105, + "num_tokens": 282453373.0, + "step": 11162 + }, + { + "epoch": 1.2258950142763014, + "grad_norm": 2.279961585998535, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.732149600982666, + "num_tokens": 282478039.0, + "step": 11163 + }, + { + "epoch": 1.226004831978915, + "grad_norm": 2.516894578933716, + "learning_rate": 1e-06, + "loss": 0.817, + "mean_token_accuracy": 0.7371519804000854, + "num_tokens": 282498674.0, + "step": 11164 + }, + { + "epoch": 1.2261146496815287, + "grad_norm": 2.5152902603149414, + "learning_rate": 1e-06, + "loss": 0.909, + "mean_token_accuracy": 0.7111714482307434, + "num_tokens": 282519476.0, + "step": 11165 + }, + { + "epoch": 1.2262244673841423, + "grad_norm": 2.1246485710144043, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7159274220466614, + "num_tokens": 282549176.0, + "step": 11166 + }, + { + "epoch": 1.226334285086756, + "grad_norm": 2.6429240703582764, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7239608764648438, + "num_tokens": 282571183.0, + "step": 11167 + }, + { + "epoch": 1.2264441027893696, + "grad_norm": 2.0625481605529785, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7195703983306885, + "num_tokens": 
282600051.0, + "step": 11168 + }, + { + "epoch": 1.2265539204919833, + "grad_norm": 2.03052020072937, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7037916779518127, + "num_tokens": 282629140.0, + "step": 11169 + }, + { + "epoch": 1.226663738194597, + "grad_norm": 2.2068722248077393, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7097924947738647, + "num_tokens": 282655297.0, + "step": 11170 + }, + { + "epoch": 1.2267735558972106, + "grad_norm": 2.3472156524658203, + "learning_rate": 1e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7240892648696899, + "num_tokens": 282679697.0, + "step": 11171 + }, + { + "epoch": 1.2268833735998244, + "grad_norm": 2.222142457962036, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.721585750579834, + "num_tokens": 282707074.0, + "step": 11172 + }, + { + "epoch": 1.226993191302438, + "grad_norm": 2.10840106010437, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.6946843862533569, + "num_tokens": 282737167.0, + "step": 11173 + }, + { + "epoch": 1.2271030090050516, + "grad_norm": 2.2740426063537598, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7180488109588623, + "num_tokens": 282763469.0, + "step": 11174 + }, + { + "epoch": 1.2272128267076652, + "grad_norm": 2.390829563140869, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7139167785644531, + "num_tokens": 282788750.0, + "step": 11175 + }, + { + "epoch": 1.227322644410279, + "grad_norm": 1.9872926473617554, + "learning_rate": 1e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.7271102666854858, + "num_tokens": 282818988.0, + "step": 11176 + }, + { + "epoch": 1.2274324621128927, + "grad_norm": 2.2044894695281982, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7052919268608093, + "num_tokens": 282846266.0, + "step": 11177 + }, + { + "epoch": 1.2275422798155062, + "grad_norm": 2.2501792907714844, + "learning_rate": 1e-06, 
+ "loss": 0.9609, + "mean_token_accuracy": 0.7055424451828003, + "num_tokens": 282875502.0, + "step": 11178 + }, + { + "epoch": 1.22765209751812, + "grad_norm": 2.1411972045898438, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.6978126168251038, + "num_tokens": 282906122.0, + "step": 11179 + }, + { + "epoch": 1.2277619152207335, + "grad_norm": 2.220637083053589, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7307319045066833, + "num_tokens": 282931752.0, + "step": 11180 + }, + { + "epoch": 1.2278717329233473, + "grad_norm": 2.086172103881836, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.6986204981803894, + "num_tokens": 282961147.0, + "step": 11181 + }, + { + "epoch": 1.2279815506259608, + "grad_norm": 2.423875331878662, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7164469957351685, + "num_tokens": 282984082.0, + "step": 11182 + }, + { + "epoch": 1.2280913683285746, + "grad_norm": 2.2648699283599854, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7191357612609863, + "num_tokens": 283009198.0, + "step": 11183 + }, + { + "epoch": 1.228201186031188, + "grad_norm": 2.339315891265869, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7056784629821777, + "num_tokens": 283033978.0, + "step": 11184 + }, + { + "epoch": 1.2283110037338019, + "grad_norm": 2.6719045639038086, + "learning_rate": 1e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7304974794387817, + "num_tokens": 283052485.0, + "step": 11185 + }, + { + "epoch": 1.2284208214364156, + "grad_norm": 2.0734877586364746, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7214988470077515, + "num_tokens": 283082522.0, + "step": 11186 + }, + { + "epoch": 1.2285306391390292, + "grad_norm": 2.340120553970337, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.711617648601532, + "num_tokens": 283106630.0, + "step": 11187 + }, + { + "epoch": 
1.228640456841643, + "grad_norm": 2.624570369720459, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7066131830215454, + "num_tokens": 283128265.0, + "step": 11188 + }, + { + "epoch": 1.2287502745442564, + "grad_norm": 2.5086512565612793, + "learning_rate": 1e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7250571846961975, + "num_tokens": 283149337.0, + "step": 11189 + }, + { + "epoch": 1.2288600922468702, + "grad_norm": 2.2459003925323486, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7103856801986694, + "num_tokens": 283175452.0, + "step": 11190 + }, + { + "epoch": 1.228969909949484, + "grad_norm": 2.400386333465576, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7121707201004028, + "num_tokens": 283200153.0, + "step": 11191 + }, + { + "epoch": 1.2290797276520975, + "grad_norm": 2.290205240249634, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7194861173629761, + "num_tokens": 283225605.0, + "step": 11192 + }, + { + "epoch": 1.2291895453547113, + "grad_norm": 2.2236530780792236, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.6975523233413696, + "num_tokens": 283250843.0, + "step": 11193 + }, + { + "epoch": 1.2292993630573248, + "grad_norm": 2.1572012901306152, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7331846952438354, + "num_tokens": 283276475.0, + "step": 11194 + }, + { + "epoch": 1.2294091807599385, + "grad_norm": 2.05092191696167, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7162907719612122, + "num_tokens": 283303133.0, + "step": 11195 + }, + { + "epoch": 1.229518998462552, + "grad_norm": 2.264904022216797, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7051631212234497, + "num_tokens": 283328814.0, + "step": 11196 + }, + { + "epoch": 1.2296288161651658, + "grad_norm": 2.3880465030670166, + "learning_rate": 1e-06, + "loss": 0.7936, + "mean_token_accuracy": 
0.7437834739685059, + "num_tokens": 283352375.0, + "step": 11197 + }, + { + "epoch": 1.2297386338677794, + "grad_norm": 2.220250129699707, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7115100622177124, + "num_tokens": 283378452.0, + "step": 11198 + }, + { + "epoch": 1.2298484515703931, + "grad_norm": 2.6175601482391357, + "learning_rate": 1e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.732393741607666, + "num_tokens": 283398162.0, + "step": 11199 + }, + { + "epoch": 1.2299582692730069, + "grad_norm": 2.3188538551330566, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7153885364532471, + "num_tokens": 283422163.0, + "step": 11200 + }, + { + "epoch": 1.2300680869756204, + "grad_norm": 2.2733469009399414, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7165313959121704, + "num_tokens": 283449548.0, + "step": 11201 + }, + { + "epoch": 1.2301779046782342, + "grad_norm": 2.2782654762268066, + "learning_rate": 1e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7113107442855835, + "num_tokens": 283473763.0, + "step": 11202 + }, + { + "epoch": 1.2302877223808477, + "grad_norm": 2.246239185333252, + "learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.719610333442688, + "num_tokens": 283499070.0, + "step": 11203 + }, + { + "epoch": 1.2303975400834615, + "grad_norm": 2.35955810546875, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.710465669631958, + "num_tokens": 283523246.0, + "step": 11204 + }, + { + "epoch": 1.2305073577860752, + "grad_norm": 1.9681535959243774, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.6977519989013672, + "num_tokens": 283558146.0, + "step": 11205 + }, + { + "epoch": 1.2306171754886888, + "grad_norm": 2.2692997455596924, + "learning_rate": 1e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7327834367752075, + "num_tokens": 283582908.0, + "step": 11206 + }, + { + "epoch": 1.2307269931913025, + "grad_norm": 
2.4424641132354736, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.6990309953689575, + "num_tokens": 283606393.0, + "step": 11207 + }, + { + "epoch": 1.230836810893916, + "grad_norm": 2.010030508041382, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7079240083694458, + "num_tokens": 283636894.0, + "step": 11208 + }, + { + "epoch": 1.2309466285965298, + "grad_norm": 2.104388475418091, + "learning_rate": 1e-06, + "loss": 0.8406, + "mean_token_accuracy": 0.7369576096534729, + "num_tokens": 283663158.0, + "step": 11209 + }, + { + "epoch": 1.2310564462991433, + "grad_norm": 2.26326322555542, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7093148231506348, + "num_tokens": 283689760.0, + "step": 11210 + }, + { + "epoch": 1.231166264001757, + "grad_norm": 2.5916171073913574, + "learning_rate": 1e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7151975631713867, + "num_tokens": 283708737.0, + "step": 11211 + }, + { + "epoch": 1.2312760817043706, + "grad_norm": 2.319514513015747, + "learning_rate": 1e-06, + "loss": 0.8158, + "mean_token_accuracy": 0.7412936687469482, + "num_tokens": 283731713.0, + "step": 11212 + }, + { + "epoch": 1.2313858994069844, + "grad_norm": 2.467827796936035, + "learning_rate": 1e-06, + "loss": 0.8204, + "mean_token_accuracy": 0.7370365858078003, + "num_tokens": 283753093.0, + "step": 11213 + }, + { + "epoch": 1.2314957171095982, + "grad_norm": 2.1979897022247314, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7191956043243408, + "num_tokens": 283778178.0, + "step": 11214 + }, + { + "epoch": 1.2316055348122117, + "grad_norm": 2.1917645931243896, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.6984984874725342, + "num_tokens": 283804849.0, + "step": 11215 + }, + { + "epoch": 1.2317153525148254, + "grad_norm": 2.3538074493408203, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7079575061798096, + "num_tokens": 
283828465.0, + "step": 11216 + }, + { + "epoch": 1.231825170217439, + "grad_norm": 1.986614465713501, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7204031944274902, + "num_tokens": 283860505.0, + "step": 11217 + }, + { + "epoch": 1.2319349879200527, + "grad_norm": 2.308776617050171, + "learning_rate": 1e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.727486252784729, + "num_tokens": 283883105.0, + "step": 11218 + }, + { + "epoch": 1.2320448056226665, + "grad_norm": 2.5210788249969482, + "learning_rate": 1e-06, + "loss": 0.8608, + "mean_token_accuracy": 0.7275505065917969, + "num_tokens": 283904402.0, + "step": 11219 + }, + { + "epoch": 1.23215462332528, + "grad_norm": 2.149827241897583, + "learning_rate": 1e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.7299686074256897, + "num_tokens": 283930736.0, + "step": 11220 + }, + { + "epoch": 1.2322644410278938, + "grad_norm": 2.4404208660125732, + "learning_rate": 1e-06, + "loss": 0.837, + "mean_token_accuracy": 0.7288088798522949, + "num_tokens": 283952485.0, + "step": 11221 + }, + { + "epoch": 1.2323742587305073, + "grad_norm": 2.388876438140869, + "learning_rate": 1e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.7397633790969849, + "num_tokens": 283976291.0, + "step": 11222 + }, + { + "epoch": 1.232484076433121, + "grad_norm": 2.7083842754364014, + "learning_rate": 1e-06, + "loss": 0.8289, + "mean_token_accuracy": 0.7383493185043335, + "num_tokens": 283994750.0, + "step": 11223 + }, + { + "epoch": 1.2325938941357346, + "grad_norm": 2.3472025394439697, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7088536024093628, + "num_tokens": 284019896.0, + "step": 11224 + }, + { + "epoch": 1.2327037118383484, + "grad_norm": 2.064964532852173, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.718250036239624, + "num_tokens": 284049376.0, + "step": 11225 + }, + { + "epoch": 1.232813529540962, + "grad_norm": 2.1253762245178223, + "learning_rate": 1e-06, + 
"loss": 0.9755, + "mean_token_accuracy": 0.713111400604248, + "num_tokens": 284080079.0, + "step": 11226 + }, + { + "epoch": 1.2329233472435757, + "grad_norm": 2.3076083660125732, + "learning_rate": 1e-06, + "loss": 1.0302, + "mean_token_accuracy": 0.6929727792739868, + "num_tokens": 284106282.0, + "step": 11227 + }, + { + "epoch": 1.2330331649461894, + "grad_norm": 2.2770462036132812, + "learning_rate": 1e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7267972826957703, + "num_tokens": 284130160.0, + "step": 11228 + }, + { + "epoch": 1.233142982648803, + "grad_norm": 2.2024712562561035, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7127280831336975, + "num_tokens": 284156318.0, + "step": 11229 + }, + { + "epoch": 1.2332528003514167, + "grad_norm": 2.6898539066314697, + "learning_rate": 1e-06, + "loss": 0.8199, + "mean_token_accuracy": 0.7371560335159302, + "num_tokens": 284175417.0, + "step": 11230 + }, + { + "epoch": 1.2333626180540302, + "grad_norm": 2.4861273765563965, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7115585207939148, + "num_tokens": 284197279.0, + "step": 11231 + }, + { + "epoch": 1.233472435756644, + "grad_norm": 2.242973566055298, + "learning_rate": 1e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7182469367980957, + "num_tokens": 284222673.0, + "step": 11232 + }, + { + "epoch": 1.2335822534592575, + "grad_norm": 2.2365562915802, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7132826447486877, + "num_tokens": 284247985.0, + "step": 11233 + }, + { + "epoch": 1.2336920711618713, + "grad_norm": 2.29132342338562, + "learning_rate": 1e-06, + "loss": 0.8482, + "mean_token_accuracy": 0.7336205840110779, + "num_tokens": 284273908.0, + "step": 11234 + }, + { + "epoch": 1.2338018888644848, + "grad_norm": 2.196650981903076, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.737714409828186, + "num_tokens": 284299554.0, + "step": 11235 + }, + { + "epoch": 
1.2339117065670986, + "grad_norm": 2.5322418212890625, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.6965184807777405, + "num_tokens": 284323217.0, + "step": 11236 + }, + { + "epoch": 1.2340215242697123, + "grad_norm": 2.4346373081207275, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7040905952453613, + "num_tokens": 284348008.0, + "step": 11237 + }, + { + "epoch": 1.2341313419723259, + "grad_norm": 2.209865093231201, + "learning_rate": 1e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.7249418497085571, + "num_tokens": 284374798.0, + "step": 11238 + }, + { + "epoch": 1.2342411596749396, + "grad_norm": 2.3858439922332764, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7152597904205322, + "num_tokens": 284398048.0, + "step": 11239 + }, + { + "epoch": 1.2343509773775532, + "grad_norm": 2.738638401031494, + "learning_rate": 1e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.7261970043182373, + "num_tokens": 284415405.0, + "step": 11240 + }, + { + "epoch": 1.234460795080167, + "grad_norm": 2.4420430660247803, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7088867425918579, + "num_tokens": 284439901.0, + "step": 11241 + }, + { + "epoch": 1.2345706127827807, + "grad_norm": 2.1127066612243652, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.726243257522583, + "num_tokens": 284467228.0, + "step": 11242 + }, + { + "epoch": 1.2346804304853942, + "grad_norm": 2.2055225372314453, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7113684415817261, + "num_tokens": 284494277.0, + "step": 11243 + }, + { + "epoch": 1.234790248188008, + "grad_norm": 2.387826681137085, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.697866678237915, + "num_tokens": 284517808.0, + "step": 11244 + }, + { + "epoch": 1.2349000658906215, + "grad_norm": 2.3111345767974854, + "learning_rate": 1e-06, + "loss": 0.8657, + "mean_token_accuracy": 
0.729188084602356, + "num_tokens": 284540390.0, + "step": 11245 + }, + { + "epoch": 1.2350098835932353, + "grad_norm": 2.3034565448760986, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7346156239509583, + "num_tokens": 284564908.0, + "step": 11246 + }, + { + "epoch": 1.2351197012958488, + "grad_norm": 2.055288553237915, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7090548276901245, + "num_tokens": 284595990.0, + "step": 11247 + }, + { + "epoch": 1.2352295189984626, + "grad_norm": 2.1748974323272705, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.703527569770813, + "num_tokens": 284621979.0, + "step": 11248 + }, + { + "epoch": 1.235339336701076, + "grad_norm": 2.426945447921753, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.71280837059021, + "num_tokens": 284643660.0, + "step": 11249 + }, + { + "epoch": 1.2354491544036899, + "grad_norm": 2.3203868865966797, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7242777347564697, + "num_tokens": 284669072.0, + "step": 11250 + }, + { + "epoch": 1.2355589721063036, + "grad_norm": 2.123457670211792, + "learning_rate": 1e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7294948697090149, + "num_tokens": 284700376.0, + "step": 11251 + }, + { + "epoch": 1.2356687898089171, + "grad_norm": 2.4442594051361084, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.6975256204605103, + "num_tokens": 284723799.0, + "step": 11252 + }, + { + "epoch": 1.235778607511531, + "grad_norm": 2.048332452774048, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7069187164306641, + "num_tokens": 284753786.0, + "step": 11253 + }, + { + "epoch": 1.2358884252141444, + "grad_norm": 2.306044816970825, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7034279108047485, + "num_tokens": 284779305.0, + "step": 11254 + }, + { + "epoch": 1.2359982429167582, + "grad_norm": 
2.451655864715576, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7086018323898315, + "num_tokens": 284804113.0, + "step": 11255 + }, + { + "epoch": 1.236108060619372, + "grad_norm": 2.7733004093170166, + "learning_rate": 1e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.7335852384567261, + "num_tokens": 284821752.0, + "step": 11256 + }, + { + "epoch": 1.2362178783219855, + "grad_norm": 2.244757652282715, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7102969884872437, + "num_tokens": 284846334.0, + "step": 11257 + }, + { + "epoch": 1.2363276960245992, + "grad_norm": 2.125246286392212, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7098912000656128, + "num_tokens": 284874969.0, + "step": 11258 + }, + { + "epoch": 1.2364375137272128, + "grad_norm": 2.2497739791870117, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7171553373336792, + "num_tokens": 284900419.0, + "step": 11259 + }, + { + "epoch": 1.2365473314298265, + "grad_norm": 2.197718381881714, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7092969417572021, + "num_tokens": 284926432.0, + "step": 11260 + }, + { + "epoch": 1.23665714913244, + "grad_norm": 2.2981889247894287, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7050408124923706, + "num_tokens": 284951904.0, + "step": 11261 + }, + { + "epoch": 1.2367669668350538, + "grad_norm": 2.1218392848968506, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7153152823448181, + "num_tokens": 284981361.0, + "step": 11262 + }, + { + "epoch": 1.2368767845376674, + "grad_norm": 2.2245559692382812, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.709835946559906, + "num_tokens": 285009636.0, + "step": 11263 + }, + { + "epoch": 1.2369866022402811, + "grad_norm": 2.322449207305908, + "learning_rate": 1e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7334744930267334, + "num_tokens": 
285033589.0, + "step": 11264 + }, + { + "epoch": 1.2370964199428949, + "grad_norm": 2.256908893585205, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.7031539082527161, + "num_tokens": 285059170.0, + "step": 11265 + }, + { + "epoch": 1.2372062376455084, + "grad_norm": 2.438335418701172, + "learning_rate": 1e-06, + "loss": 0.7716, + "mean_token_accuracy": 0.7596746683120728, + "num_tokens": 285081799.0, + "step": 11266 + }, + { + "epoch": 1.2373160553481222, + "grad_norm": 2.433236837387085, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7146768569946289, + "num_tokens": 285103780.0, + "step": 11267 + }, + { + "epoch": 1.2374258730507357, + "grad_norm": 2.338170289993286, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7236274480819702, + "num_tokens": 285127043.0, + "step": 11268 + }, + { + "epoch": 1.2375356907533495, + "grad_norm": 2.291531801223755, + "learning_rate": 1e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.7219986319541931, + "num_tokens": 285152913.0, + "step": 11269 + }, + { + "epoch": 1.2376455084559632, + "grad_norm": 2.240551710128784, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7186776995658875, + "num_tokens": 285177474.0, + "step": 11270 + }, + { + "epoch": 1.2377553261585768, + "grad_norm": 2.1999874114990234, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7104927897453308, + "num_tokens": 285204385.0, + "step": 11271 + }, + { + "epoch": 1.2378651438611905, + "grad_norm": 2.406522750854492, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7204667329788208, + "num_tokens": 285228071.0, + "step": 11272 + }, + { + "epoch": 1.237974961563804, + "grad_norm": 2.347440481185913, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7232604026794434, + "num_tokens": 285252646.0, + "step": 11273 + }, + { + "epoch": 1.2380847792664178, + "grad_norm": 2.434148073196411, + "learning_rate": 
1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7299951910972595, + "num_tokens": 285274619.0, + "step": 11274 + }, + { + "epoch": 1.2381945969690313, + "grad_norm": 2.341874361038208, + "learning_rate": 1e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.7297946214675903, + "num_tokens": 285296938.0, + "step": 11275 + }, + { + "epoch": 1.238304414671645, + "grad_norm": 2.1377487182617188, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7126666307449341, + "num_tokens": 285324219.0, + "step": 11276 + }, + { + "epoch": 1.2384142323742586, + "grad_norm": 2.238384246826172, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7173150181770325, + "num_tokens": 285350282.0, + "step": 11277 + }, + { + "epoch": 1.2385240500768724, + "grad_norm": 2.656435966491699, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7042933106422424, + "num_tokens": 285370341.0, + "step": 11278 + }, + { + "epoch": 1.2386338677794861, + "grad_norm": 2.624537229537964, + "learning_rate": 1e-06, + "loss": 0.8164, + "mean_token_accuracy": 0.7496553659439087, + "num_tokens": 285389043.0, + "step": 11279 + }, + { + "epoch": 1.2387436854820997, + "grad_norm": 2.49762225151062, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7200961112976074, + "num_tokens": 285410398.0, + "step": 11280 + }, + { + "epoch": 1.2388535031847134, + "grad_norm": 2.0214710235595703, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7132447957992554, + "num_tokens": 285441894.0, + "step": 11281 + }, + { + "epoch": 1.238963320887327, + "grad_norm": 2.031973123550415, + "learning_rate": 1e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7221484780311584, + "num_tokens": 285469559.0, + "step": 11282 + }, + { + "epoch": 1.2390731385899407, + "grad_norm": 2.472891330718994, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7104241847991943, + "num_tokens": 285490646.0, + "step": 11283 + }, + { + 
"epoch": 1.2391829562925545, + "grad_norm": 2.2110002040863037, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7238006591796875, + "num_tokens": 285516096.0, + "step": 11284 + }, + { + "epoch": 1.239292773995168, + "grad_norm": 2.3747918605804443, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7072608470916748, + "num_tokens": 285541727.0, + "step": 11285 + }, + { + "epoch": 1.2394025916977818, + "grad_norm": 2.138296604156494, + "learning_rate": 1e-06, + "loss": 1.0313, + "mean_token_accuracy": 0.6797294616699219, + "num_tokens": 285572938.0, + "step": 11286 + }, + { + "epoch": 1.2395124094003953, + "grad_norm": 2.233135938644409, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7073193788528442, + "num_tokens": 285598171.0, + "step": 11287 + }, + { + "epoch": 1.239622227103009, + "grad_norm": 2.526247501373291, + "learning_rate": 1e-06, + "loss": 0.8652, + "mean_token_accuracy": 0.727473795413971, + "num_tokens": 285619103.0, + "step": 11288 + }, + { + "epoch": 1.2397320448056226, + "grad_norm": 2.183126926422119, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7136770486831665, + "num_tokens": 285646130.0, + "step": 11289 + }, + { + "epoch": 1.2398418625082364, + "grad_norm": 2.2476110458374023, + "learning_rate": 1e-06, + "loss": 0.876, + "mean_token_accuracy": 0.7264246344566345, + "num_tokens": 285674925.0, + "step": 11290 + }, + { + "epoch": 1.23995168021085, + "grad_norm": 2.0255160331726074, + "learning_rate": 1e-06, + "loss": 0.8376, + "mean_token_accuracy": 0.7346330881118774, + "num_tokens": 285702511.0, + "step": 11291 + }, + { + "epoch": 1.2400614979134637, + "grad_norm": 2.375997543334961, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7272212505340576, + "num_tokens": 285725862.0, + "step": 11292 + }, + { + "epoch": 1.2401713156160774, + "grad_norm": 2.2915046215057373, + "learning_rate": 1e-06, + "loss": 0.8573, + 
"mean_token_accuracy": 0.7283158302307129, + "num_tokens": 285751008.0, + "step": 11293 + }, + { + "epoch": 1.240281133318691, + "grad_norm": 2.399817943572998, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7068703174591064, + "num_tokens": 285776862.0, + "step": 11294 + }, + { + "epoch": 1.2403909510213047, + "grad_norm": 2.1958038806915283, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.6935615539550781, + "num_tokens": 285805344.0, + "step": 11295 + }, + { + "epoch": 1.2405007687239182, + "grad_norm": 2.3897032737731934, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7312930822372437, + "num_tokens": 285829006.0, + "step": 11296 + }, + { + "epoch": 1.240610586426532, + "grad_norm": 2.3567774295806885, + "learning_rate": 1e-06, + "loss": 0.844, + "mean_token_accuracy": 0.7476754188537598, + "num_tokens": 285853952.0, + "step": 11297 + }, + { + "epoch": 1.2407204041291455, + "grad_norm": 2.0003902912139893, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7055195569992065, + "num_tokens": 285885379.0, + "step": 11298 + }, + { + "epoch": 1.2408302218317593, + "grad_norm": 2.26238751411438, + "learning_rate": 1e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7153784036636353, + "num_tokens": 285910165.0, + "step": 11299 + }, + { + "epoch": 1.2409400395343728, + "grad_norm": 2.3109419345855713, + "learning_rate": 1e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.7322069406509399, + "num_tokens": 285936158.0, + "step": 11300 + }, + { + "epoch": 1.2410498572369866, + "grad_norm": 2.6262009143829346, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7257091403007507, + "num_tokens": 285956976.0, + "step": 11301 + }, + { + "epoch": 1.2411596749396003, + "grad_norm": 2.333866596221924, + "learning_rate": 1e-06, + "loss": 0.8577, + "mean_token_accuracy": 0.7275500297546387, + "num_tokens": 285979450.0, + "step": 11302 + }, + { + "epoch": 
1.2412694926422139, + "grad_norm": 2.406087636947632, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.7081228494644165, + "num_tokens": 286006722.0, + "step": 11303 + }, + { + "epoch": 1.2413793103448276, + "grad_norm": 2.189417600631714, + "learning_rate": 1e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.6933938264846802, + "num_tokens": 286034962.0, + "step": 11304 + }, + { + "epoch": 1.2414891280474412, + "grad_norm": 2.3782386779785156, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7178845405578613, + "num_tokens": 286058052.0, + "step": 11305 + }, + { + "epoch": 1.241598945750055, + "grad_norm": 2.5110039710998535, + "learning_rate": 1e-06, + "loss": 0.8021, + "mean_token_accuracy": 0.7429237961769104, + "num_tokens": 286078218.0, + "step": 11306 + }, + { + "epoch": 1.2417087634526687, + "grad_norm": 2.412841796875, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7177311778068542, + "num_tokens": 286100291.0, + "step": 11307 + }, + { + "epoch": 1.2418185811552822, + "grad_norm": 2.332214117050171, + "learning_rate": 1e-06, + "loss": 0.8526, + "mean_token_accuracy": 0.7326207756996155, + "num_tokens": 286123966.0, + "step": 11308 + }, + { + "epoch": 1.241928398857896, + "grad_norm": 2.14691424369812, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.720727801322937, + "num_tokens": 286150545.0, + "step": 11309 + }, + { + "epoch": 1.2420382165605095, + "grad_norm": 2.178431987762451, + "learning_rate": 1e-06, + "loss": 0.8778, + "mean_token_accuracy": 0.7213363647460938, + "num_tokens": 286176438.0, + "step": 11310 + }, + { + "epoch": 1.2421480342631233, + "grad_norm": 2.514007329940796, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.7005997896194458, + "num_tokens": 286198955.0, + "step": 11311 + }, + { + "epoch": 1.2422578519657368, + "grad_norm": 2.52089786529541, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 
0.7200259566307068, + "num_tokens": 286223874.0, + "step": 11312 + }, + { + "epoch": 1.2423676696683505, + "grad_norm": 2.2280263900756836, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7310388684272766, + "num_tokens": 286251946.0, + "step": 11313 + }, + { + "epoch": 1.242477487370964, + "grad_norm": 1.9819090366363525, + "learning_rate": 1e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.7085872888565063, + "num_tokens": 286281044.0, + "step": 11314 + }, + { + "epoch": 1.2425873050735778, + "grad_norm": 2.3811593055725098, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7290751338005066, + "num_tokens": 286303804.0, + "step": 11315 + }, + { + "epoch": 1.2426971227761916, + "grad_norm": 2.160080671310425, + "learning_rate": 1e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7204679250717163, + "num_tokens": 286331220.0, + "step": 11316 + }, + { + "epoch": 1.2428069404788051, + "grad_norm": 2.265315055847168, + "learning_rate": 1e-06, + "loss": 0.856, + "mean_token_accuracy": 0.7428039312362671, + "num_tokens": 286354205.0, + "step": 11317 + }, + { + "epoch": 1.242916758181419, + "grad_norm": 2.390204906463623, + "learning_rate": 1e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7208334803581238, + "num_tokens": 286376216.0, + "step": 11318 + }, + { + "epoch": 1.2430265758840324, + "grad_norm": 2.224416732788086, + "learning_rate": 1e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7410526275634766, + "num_tokens": 286399843.0, + "step": 11319 + }, + { + "epoch": 1.2431363935866462, + "grad_norm": 2.0192651748657227, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.6966143846511841, + "num_tokens": 286431314.0, + "step": 11320 + }, + { + "epoch": 1.24324621128926, + "grad_norm": 2.168992519378662, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7233328819274902, + "num_tokens": 286456628.0, + "step": 11321 + }, + { + "epoch": 1.2433560289918735, + "grad_norm": 
2.694850206375122, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7263215780258179, + "num_tokens": 286476572.0, + "step": 11322 + }, + { + "epoch": 1.2434658466944872, + "grad_norm": 2.0971508026123047, + "learning_rate": 1e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7239524126052856, + "num_tokens": 286505761.0, + "step": 11323 + }, + { + "epoch": 1.2435756643971008, + "grad_norm": 2.0866858959198, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.6988223791122437, + "num_tokens": 286535620.0, + "step": 11324 + }, + { + "epoch": 1.2436854820997145, + "grad_norm": 2.409794807434082, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.723702609539032, + "num_tokens": 286559111.0, + "step": 11325 + }, + { + "epoch": 1.243795299802328, + "grad_norm": 2.163586139678955, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7316017150878906, + "num_tokens": 286588383.0, + "step": 11326 + }, + { + "epoch": 1.2439051175049418, + "grad_norm": 2.2569260597229004, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7182969450950623, + "num_tokens": 286615518.0, + "step": 11327 + }, + { + "epoch": 1.2440149352075554, + "grad_norm": 2.2594072818756104, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7246990203857422, + "num_tokens": 286639364.0, + "step": 11328 + }, + { + "epoch": 1.244124752910169, + "grad_norm": 2.25539231300354, + "learning_rate": 1e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7155482769012451, + "num_tokens": 286664088.0, + "step": 11329 + }, + { + "epoch": 1.2442345706127829, + "grad_norm": 2.2344679832458496, + "learning_rate": 1e-06, + "loss": 0.8551, + "mean_token_accuracy": 0.7275612950325012, + "num_tokens": 286690660.0, + "step": 11330 + }, + { + "epoch": 1.2443443883153964, + "grad_norm": 2.5682427883148193, + "learning_rate": 1e-06, + "loss": 0.8095, + "mean_token_accuracy": 0.7428159713745117, + "num_tokens": 
286711667.0, + "step": 11331 + }, + { + "epoch": 1.2444542060180102, + "grad_norm": 2.1483330726623535, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7197403311729431, + "num_tokens": 286739822.0, + "step": 11332 + }, + { + "epoch": 1.2445640237206237, + "grad_norm": 2.1911582946777344, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7279666662216187, + "num_tokens": 286766937.0, + "step": 11333 + }, + { + "epoch": 1.2446738414232374, + "grad_norm": 2.310537576675415, + "learning_rate": 1e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.7248297929763794, + "num_tokens": 286789756.0, + "step": 11334 + }, + { + "epoch": 1.2447836591258512, + "grad_norm": 2.4916598796844482, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7231995463371277, + "num_tokens": 286813672.0, + "step": 11335 + }, + { + "epoch": 1.2448934768284647, + "grad_norm": 2.2739953994750977, + "learning_rate": 1e-06, + "loss": 0.8615, + "mean_token_accuracy": 0.7292754054069519, + "num_tokens": 286840050.0, + "step": 11336 + }, + { + "epoch": 1.2450032945310785, + "grad_norm": 2.1887881755828857, + "learning_rate": 1e-06, + "loss": 0.8569, + "mean_token_accuracy": 0.7357890605926514, + "num_tokens": 286867103.0, + "step": 11337 + }, + { + "epoch": 1.245113112233692, + "grad_norm": 2.2741332054138184, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7027904987335205, + "num_tokens": 286892690.0, + "step": 11338 + }, + { + "epoch": 1.2452229299363058, + "grad_norm": 2.3358705043792725, + "learning_rate": 1e-06, + "loss": 0.8697, + "mean_token_accuracy": 0.7259901165962219, + "num_tokens": 286913932.0, + "step": 11339 + }, + { + "epoch": 1.2453327476389193, + "grad_norm": 2.368370294570923, + "learning_rate": 1e-06, + "loss": 0.8685, + "mean_token_accuracy": 0.7268901467323303, + "num_tokens": 286938119.0, + "step": 11340 + }, + { + "epoch": 1.245442565341533, + "grad_norm": 2.171656608581543, + "learning_rate": 
1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7079019546508789, + "num_tokens": 286963529.0, + "step": 11341 + }, + { + "epoch": 1.2455523830441466, + "grad_norm": 2.0199978351593018, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7016066908836365, + "num_tokens": 286993035.0, + "step": 11342 + }, + { + "epoch": 1.2456622007467604, + "grad_norm": 2.6278584003448486, + "learning_rate": 1e-06, + "loss": 0.7955, + "mean_token_accuracy": 0.7481510043144226, + "num_tokens": 287012752.0, + "step": 11343 + }, + { + "epoch": 1.2457720184493741, + "grad_norm": 2.5764853954315186, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.6988160014152527, + "num_tokens": 287035942.0, + "step": 11344 + }, + { + "epoch": 1.2458818361519877, + "grad_norm": 2.2402760982513428, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7138579487800598, + "num_tokens": 287061552.0, + "step": 11345 + }, + { + "epoch": 1.2459916538546014, + "grad_norm": 2.368640661239624, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7126070857048035, + "num_tokens": 287084552.0, + "step": 11346 + }, + { + "epoch": 1.246101471557215, + "grad_norm": 2.66225004196167, + "learning_rate": 1e-06, + "loss": 0.8564, + "mean_token_accuracy": 0.7386640310287476, + "num_tokens": 287108123.0, + "step": 11347 + }, + { + "epoch": 1.2462112892598287, + "grad_norm": 2.1418163776397705, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7233355045318604, + "num_tokens": 287135474.0, + "step": 11348 + }, + { + "epoch": 1.2463211069624422, + "grad_norm": 2.3250491619110107, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.6944891214370728, + "num_tokens": 287161591.0, + "step": 11349 + }, + { + "epoch": 1.246430924665056, + "grad_norm": 2.0507469177246094, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7018681168556213, + "num_tokens": 287191606.0, + "step": 11350 + }, + { + 
"epoch": 1.2465407423676698, + "grad_norm": 2.1415047645568848, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7177069783210754, + "num_tokens": 287218448.0, + "step": 11351 + }, + { + "epoch": 1.2466505600702833, + "grad_norm": 2.1946144104003906, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7078403234481812, + "num_tokens": 287244907.0, + "step": 11352 + }, + { + "epoch": 1.246760377772897, + "grad_norm": 2.249812602996826, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7148203253746033, + "num_tokens": 287270496.0, + "step": 11353 + }, + { + "epoch": 1.2468701954755106, + "grad_norm": 2.537461757659912, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7152255773544312, + "num_tokens": 287294405.0, + "step": 11354 + }, + { + "epoch": 1.2469800131781243, + "grad_norm": 2.5427520275115967, + "learning_rate": 1e-06, + "loss": 0.8274, + "mean_token_accuracy": 0.7331538200378418, + "num_tokens": 287314216.0, + "step": 11355 + }, + { + "epoch": 1.2470898308807379, + "grad_norm": 2.1212213039398193, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7084729671478271, + "num_tokens": 287343564.0, + "step": 11356 + }, + { + "epoch": 1.2471996485833516, + "grad_norm": 2.01247239112854, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.6984602808952332, + "num_tokens": 287376896.0, + "step": 11357 + }, + { + "epoch": 1.2473094662859654, + "grad_norm": 2.204080581665039, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7218279838562012, + "num_tokens": 287403218.0, + "step": 11358 + }, + { + "epoch": 1.247419283988579, + "grad_norm": 2.013892412185669, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7249894738197327, + "num_tokens": 287431742.0, + "step": 11359 + }, + { + "epoch": 1.2475291016911927, + "grad_norm": 2.0904862880706787, + "learning_rate": 1e-06, + "loss": 0.9543, + 
"mean_token_accuracy": 0.7038894891738892, + "num_tokens": 287461592.0, + "step": 11360 + }, + { + "epoch": 1.2476389193938062, + "grad_norm": 2.4260716438293457, + "learning_rate": 1e-06, + "loss": 0.8179, + "mean_token_accuracy": 0.7379955053329468, + "num_tokens": 287483723.0, + "step": 11361 + }, + { + "epoch": 1.24774873709642, + "grad_norm": 2.064105272293091, + "learning_rate": 1e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7112367153167725, + "num_tokens": 287511938.0, + "step": 11362 + }, + { + "epoch": 1.2478585547990335, + "grad_norm": 2.4476301670074463, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7323331832885742, + "num_tokens": 287533303.0, + "step": 11363 + }, + { + "epoch": 1.2479683725016473, + "grad_norm": 2.434638023376465, + "learning_rate": 1e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.728510856628418, + "num_tokens": 287556545.0, + "step": 11364 + }, + { + "epoch": 1.2480781902042608, + "grad_norm": 2.685732364654541, + "learning_rate": 1e-06, + "loss": 0.8168, + "mean_token_accuracy": 0.7349261045455933, + "num_tokens": 287574779.0, + "step": 11365 + }, + { + "epoch": 1.2481880079068746, + "grad_norm": 2.194139242172241, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7123079299926758, + "num_tokens": 287599786.0, + "step": 11366 + }, + { + "epoch": 1.2482978256094883, + "grad_norm": 2.1000325679779053, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7093605995178223, + "num_tokens": 287629964.0, + "step": 11367 + }, + { + "epoch": 1.2484076433121019, + "grad_norm": 2.131429433822632, + "learning_rate": 1e-06, + "loss": 0.8696, + "mean_token_accuracy": 0.7317938804626465, + "num_tokens": 287657061.0, + "step": 11368 + }, + { + "epoch": 1.2485174610147156, + "grad_norm": 2.5598957538604736, + "learning_rate": 1e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7300862073898315, + "num_tokens": 287678176.0, + "step": 11369 + }, + { + "epoch": 
1.2486272787173291, + "grad_norm": 2.328477144241333, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7074671983718872, + "num_tokens": 287702328.0, + "step": 11370 + }, + { + "epoch": 1.248737096419943, + "grad_norm": 2.098060131072998, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7091928720474243, + "num_tokens": 287732300.0, + "step": 11371 + }, + { + "epoch": 1.2488469141225567, + "grad_norm": 2.141796112060547, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7282016277313232, + "num_tokens": 287760577.0, + "step": 11372 + }, + { + "epoch": 1.2489567318251702, + "grad_norm": 2.224296808242798, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.7024310827255249, + "num_tokens": 287786004.0, + "step": 11373 + }, + { + "epoch": 1.249066549527784, + "grad_norm": 2.3135149478912354, + "learning_rate": 1e-06, + "loss": 0.8061, + "mean_token_accuracy": 0.7475924491882324, + "num_tokens": 287809183.0, + "step": 11374 + }, + { + "epoch": 1.2491763672303975, + "grad_norm": 2.464320421218872, + "learning_rate": 1e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7212653756141663, + "num_tokens": 287830794.0, + "step": 11375 + }, + { + "epoch": 1.2492861849330112, + "grad_norm": 2.1932120323181152, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.6896529197692871, + "num_tokens": 287858261.0, + "step": 11376 + }, + { + "epoch": 1.2493960026356248, + "grad_norm": 2.4542832374572754, + "learning_rate": 1e-06, + "loss": 0.8003, + "mean_token_accuracy": 0.7470238208770752, + "num_tokens": 287879317.0, + "step": 11377 + }, + { + "epoch": 1.2495058203382385, + "grad_norm": 2.5184082984924316, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.724592924118042, + "num_tokens": 287900427.0, + "step": 11378 + }, + { + "epoch": 1.249615638040852, + "grad_norm": 2.090420722961426, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 
0.7129827737808228, + "num_tokens": 287927663.0, + "step": 11379 + }, + { + "epoch": 1.2497254557434658, + "grad_norm": 2.1828436851501465, + "learning_rate": 1e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7357711791992188, + "num_tokens": 287953163.0, + "step": 11380 + }, + { + "epoch": 1.2498352734460796, + "grad_norm": 2.2304697036743164, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7093484997749329, + "num_tokens": 287978476.0, + "step": 11381 + }, + { + "epoch": 1.2499450911486931, + "grad_norm": 2.0811140537261963, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7108516693115234, + "num_tokens": 288005111.0, + "step": 11382 + }, + { + "epoch": 1.2500549088513069, + "grad_norm": 2.1483283042907715, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7046710848808289, + "num_tokens": 288033259.0, + "step": 11383 + }, + { + "epoch": 1.2501647265539204, + "grad_norm": 2.0930047035217285, + "learning_rate": 1e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7175551056861877, + "num_tokens": 288058490.0, + "step": 11384 + }, + { + "epoch": 1.2502745442565342, + "grad_norm": 2.3828532695770264, + "learning_rate": 1e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7215559482574463, + "num_tokens": 288082333.0, + "step": 11385 + }, + { + "epoch": 1.250384361959148, + "grad_norm": 2.648566722869873, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7187023162841797, + "num_tokens": 288103096.0, + "step": 11386 + }, + { + "epoch": 1.2504941796617615, + "grad_norm": 2.3766252994537354, + "learning_rate": 1e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7159059643745422, + "num_tokens": 288124953.0, + "step": 11387 + }, + { + "epoch": 1.2506039973643752, + "grad_norm": 2.456627607345581, + "learning_rate": 1e-06, + "loss": 0.8783, + "mean_token_accuracy": 0.7202062010765076, + "num_tokens": 288146932.0, + "step": 11388 + }, + { + "epoch": 1.2507138150669888, + "grad_norm": 
2.330833911895752, + "learning_rate": 1e-06, + "loss": 0.802, + "mean_token_accuracy": 0.7516966462135315, + "num_tokens": 288171287.0, + "step": 11389 + }, + { + "epoch": 1.2508236327696025, + "grad_norm": 1.9845962524414062, + "learning_rate": 1e-06, + "loss": 1.0335, + "mean_token_accuracy": 0.6841204762458801, + "num_tokens": 288203571.0, + "step": 11390 + }, + { + "epoch": 1.250933450472216, + "grad_norm": 2.1491212844848633, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7072337865829468, + "num_tokens": 288234086.0, + "step": 11391 + }, + { + "epoch": 1.2510432681748298, + "grad_norm": 2.3076212406158447, + "learning_rate": 1e-06, + "loss": 0.8674, + "mean_token_accuracy": 0.7284066677093506, + "num_tokens": 288257367.0, + "step": 11392 + }, + { + "epoch": 1.2511530858774433, + "grad_norm": 2.6138052940368652, + "learning_rate": 1e-06, + "loss": 0.8004, + "mean_token_accuracy": 0.7443820238113403, + "num_tokens": 288276425.0, + "step": 11393 + }, + { + "epoch": 1.251262903580057, + "grad_norm": 2.2354576587677, + "learning_rate": 1e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.7371082305908203, + "num_tokens": 288301912.0, + "step": 11394 + }, + { + "epoch": 1.2513727212826709, + "grad_norm": 2.5077595710754395, + "learning_rate": 1e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7335299253463745, + "num_tokens": 288322806.0, + "step": 11395 + }, + { + "epoch": 1.2514825389852844, + "grad_norm": 2.5176689624786377, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7132601141929626, + "num_tokens": 288346637.0, + "step": 11396 + }, + { + "epoch": 1.2515923566878981, + "grad_norm": 2.483898401260376, + "learning_rate": 1e-06, + "loss": 0.8482, + "mean_token_accuracy": 0.7437351942062378, + "num_tokens": 288368652.0, + "step": 11397 + }, + { + "epoch": 1.2517021743905117, + "grad_norm": 2.0893518924713135, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7122229337692261, + "num_tokens": 
288397851.0, + "step": 11398 + }, + { + "epoch": 1.2518119920931254, + "grad_norm": 2.251042127609253, + "learning_rate": 1e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.727637529373169, + "num_tokens": 288424575.0, + "step": 11399 + }, + { + "epoch": 1.2519218097957392, + "grad_norm": 2.4203720092773438, + "learning_rate": 1e-06, + "loss": 0.8653, + "mean_token_accuracy": 0.7317326664924622, + "num_tokens": 288447942.0, + "step": 11400 + }, + { + "epoch": 1.2520316274983527, + "grad_norm": 2.2006497383117676, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7091248035430908, + "num_tokens": 288474559.0, + "step": 11401 + }, + { + "epoch": 1.2521414452009663, + "grad_norm": 2.2234108448028564, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7086646556854248, + "num_tokens": 288500802.0, + "step": 11402 + }, + { + "epoch": 1.25225126290358, + "grad_norm": 2.408632516860962, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7069190740585327, + "num_tokens": 288525609.0, + "step": 11403 + }, + { + "epoch": 1.2523610806061938, + "grad_norm": 2.43204927444458, + "learning_rate": 1e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7310173511505127, + "num_tokens": 288546733.0, + "step": 11404 + }, + { + "epoch": 1.2524708983088073, + "grad_norm": 2.1288018226623535, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7194191217422485, + "num_tokens": 288573282.0, + "step": 11405 + }, + { + "epoch": 1.252580716011421, + "grad_norm": 2.2874767780303955, + "learning_rate": 1e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.7347941398620605, + "num_tokens": 288597603.0, + "step": 11406 + }, + { + "epoch": 1.2526905337140346, + "grad_norm": 2.363157272338867, + "learning_rate": 1e-06, + "loss": 0.8294, + "mean_token_accuracy": 0.7325649261474609, + "num_tokens": 288621984.0, + "step": 11407 + }, + { + "epoch": 1.2528003514166484, + "grad_norm": 2.1527137756347656, + "learning_rate": 
1e-06, + "loss": 0.8301, + "mean_token_accuracy": 0.7401360273361206, + "num_tokens": 288650002.0, + "step": 11408 + }, + { + "epoch": 1.2529101691192621, + "grad_norm": 2.4649572372436523, + "learning_rate": 1e-06, + "loss": 0.858, + "mean_token_accuracy": 0.7271844148635864, + "num_tokens": 288670797.0, + "step": 11409 + }, + { + "epoch": 1.2530199868218757, + "grad_norm": 2.4291553497314453, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7214019298553467, + "num_tokens": 288693296.0, + "step": 11410 + }, + { + "epoch": 1.2531298045244894, + "grad_norm": 2.152082681655884, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7225949764251709, + "num_tokens": 288719396.0, + "step": 11411 + }, + { + "epoch": 1.253239622227103, + "grad_norm": 2.535731315612793, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.6998848915100098, + "num_tokens": 288741224.0, + "step": 11412 + }, + { + "epoch": 1.2533494399297167, + "grad_norm": 2.2286765575408936, + "learning_rate": 1e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7331677079200745, + "num_tokens": 288766793.0, + "step": 11413 + }, + { + "epoch": 1.2534592576323305, + "grad_norm": 2.2128517627716064, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7227683067321777, + "num_tokens": 288793516.0, + "step": 11414 + }, + { + "epoch": 1.253569075334944, + "grad_norm": 2.1102821826934814, + "learning_rate": 1e-06, + "loss": 0.8671, + "mean_token_accuracy": 0.7227354049682617, + "num_tokens": 288822305.0, + "step": 11415 + }, + { + "epoch": 1.2536788930375575, + "grad_norm": 2.2135257720947266, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7096565961837769, + "num_tokens": 288850701.0, + "step": 11416 + }, + { + "epoch": 1.2537887107401713, + "grad_norm": 2.069397211074829, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.715597927570343, + "num_tokens": 288879297.0, + "step": 11417 + }, + { + 
"epoch": 1.253898528442785, + "grad_norm": 2.198019504547119, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7281670570373535, + "num_tokens": 288905326.0, + "step": 11418 + }, + { + "epoch": 1.2540083461453986, + "grad_norm": 2.308328628540039, + "learning_rate": 1e-06, + "loss": 0.8944, + "mean_token_accuracy": 0.7217029929161072, + "num_tokens": 288930430.0, + "step": 11419 + }, + { + "epoch": 1.2541181638480123, + "grad_norm": 2.1242868900299072, + "learning_rate": 1e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.718177080154419, + "num_tokens": 288958010.0, + "step": 11420 + }, + { + "epoch": 1.2542279815506259, + "grad_norm": 2.1180169582366943, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7197704911231995, + "num_tokens": 288986997.0, + "step": 11421 + }, + { + "epoch": 1.2543377992532396, + "grad_norm": 2.07914662361145, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7135891914367676, + "num_tokens": 289016915.0, + "step": 11422 + }, + { + "epoch": 1.2544476169558534, + "grad_norm": 2.1712305545806885, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7317850589752197, + "num_tokens": 289043029.0, + "step": 11423 + }, + { + "epoch": 1.254557434658467, + "grad_norm": 2.100691556930542, + "learning_rate": 1e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7235341668128967, + "num_tokens": 289070457.0, + "step": 11424 + }, + { + "epoch": 1.2546672523610807, + "grad_norm": 2.4080443382263184, + "learning_rate": 1e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.733639121055603, + "num_tokens": 289092496.0, + "step": 11425 + }, + { + "epoch": 1.2547770700636942, + "grad_norm": 2.520517349243164, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7034751176834106, + "num_tokens": 289114912.0, + "step": 11426 + }, + { + "epoch": 1.254886887766308, + "grad_norm": 2.1702284812927246, + "learning_rate": 1e-06, + "loss": 0.9313, + 
"mean_token_accuracy": 0.712346076965332, + "num_tokens": 289142383.0, + "step": 11427 + }, + { + "epoch": 1.2549967054689217, + "grad_norm": 2.5230724811553955, + "learning_rate": 1e-06, + "loss": 0.861, + "mean_token_accuracy": 0.7330226898193359, + "num_tokens": 289162088.0, + "step": 11428 + }, + { + "epoch": 1.2551065231715353, + "grad_norm": 2.32064151763916, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7265628576278687, + "num_tokens": 289187385.0, + "step": 11429 + }, + { + "epoch": 1.2552163408741488, + "grad_norm": 2.0930309295654297, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.6955408453941345, + "num_tokens": 289218809.0, + "step": 11430 + }, + { + "epoch": 1.2553261585767626, + "grad_norm": 2.649996280670166, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7258892059326172, + "num_tokens": 289237532.0, + "step": 11431 + }, + { + "epoch": 1.2554359762793763, + "grad_norm": 2.458603620529175, + "learning_rate": 1e-06, + "loss": 0.8564, + "mean_token_accuracy": 0.7340492010116577, + "num_tokens": 289260332.0, + "step": 11432 + }, + { + "epoch": 1.2555457939819898, + "grad_norm": 2.1491830348968506, + "learning_rate": 1e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7271736860275269, + "num_tokens": 289287641.0, + "step": 11433 + }, + { + "epoch": 1.2556556116846036, + "grad_norm": 2.2045841217041016, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7144761681556702, + "num_tokens": 289312873.0, + "step": 11434 + }, + { + "epoch": 1.2557654293872171, + "grad_norm": 2.231121301651001, + "learning_rate": 1e-06, + "loss": 0.8757, + "mean_token_accuracy": 0.7264766097068787, + "num_tokens": 289337525.0, + "step": 11435 + }, + { + "epoch": 1.255875247089831, + "grad_norm": 1.994126558303833, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.692386269569397, + "num_tokens": 289372162.0, + "step": 11436 + }, + { + "epoch": 1.2559850647924446, + 
"grad_norm": 2.212707757949829, + "learning_rate": 1e-06, + "loss": 0.8531, + "mean_token_accuracy": 0.7293692231178284, + "num_tokens": 289397230.0, + "step": 11437 + }, + { + "epoch": 1.2560948824950582, + "grad_norm": 2.123324155807495, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7087763547897339, + "num_tokens": 289426268.0, + "step": 11438 + }, + { + "epoch": 1.256204700197672, + "grad_norm": 2.163705348968506, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7052502632141113, + "num_tokens": 289456030.0, + "step": 11439 + }, + { + "epoch": 1.2563145179002855, + "grad_norm": 2.5924408435821533, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7060498595237732, + "num_tokens": 289476934.0, + "step": 11440 + }, + { + "epoch": 1.2564243356028992, + "grad_norm": 2.229918956756592, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7177944183349609, + "num_tokens": 289501286.0, + "step": 11441 + }, + { + "epoch": 1.2565341533055128, + "grad_norm": 2.6012187004089355, + "learning_rate": 1e-06, + "loss": 0.8785, + "mean_token_accuracy": 0.7283223271369934, + "num_tokens": 289522011.0, + "step": 11442 + }, + { + "epoch": 1.2566439710081265, + "grad_norm": 2.0588080883026123, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.714738130569458, + "num_tokens": 289551109.0, + "step": 11443 + }, + { + "epoch": 1.25675378871074, + "grad_norm": 2.592738628387451, + "learning_rate": 1e-06, + "loss": 0.798, + "mean_token_accuracy": 0.7474919557571411, + "num_tokens": 289569047.0, + "step": 11444 + }, + { + "epoch": 1.2568636064133538, + "grad_norm": 2.2219669818878174, + "learning_rate": 1e-06, + "loss": 0.8411, + "mean_token_accuracy": 0.7316436767578125, + "num_tokens": 289593260.0, + "step": 11445 + }, + { + "epoch": 1.2569734241159676, + "grad_norm": 2.2101407051086426, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7310938835144043, + 
"num_tokens": 289619071.0, + "step": 11446 + }, + { + "epoch": 1.257083241818581, + "grad_norm": 2.1607816219329834, + "learning_rate": 1e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.723235011100769, + "num_tokens": 289647095.0, + "step": 11447 + }, + { + "epoch": 1.2571930595211949, + "grad_norm": 2.285008668899536, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7099770903587341, + "num_tokens": 289671720.0, + "step": 11448 + }, + { + "epoch": 1.2573028772238084, + "grad_norm": 2.1048295497894287, + "learning_rate": 1e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7242691516876221, + "num_tokens": 289699529.0, + "step": 11449 + }, + { + "epoch": 1.2574126949264222, + "grad_norm": 2.41976261138916, + "learning_rate": 1e-06, + "loss": 0.867, + "mean_token_accuracy": 0.7206790447235107, + "num_tokens": 289720807.0, + "step": 11450 + }, + { + "epoch": 1.257522512629036, + "grad_norm": 1.9746787548065186, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7129296064376831, + "num_tokens": 289752390.0, + "step": 11451 + }, + { + "epoch": 1.2576323303316495, + "grad_norm": 2.494468927383423, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.733036458492279, + "num_tokens": 289773756.0, + "step": 11452 + }, + { + "epoch": 1.257742148034263, + "grad_norm": 2.3593389987945557, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7109720706939697, + "num_tokens": 289798583.0, + "step": 11453 + }, + { + "epoch": 1.2578519657368767, + "grad_norm": 2.8244335651397705, + "learning_rate": 1e-06, + "loss": 0.8203, + "mean_token_accuracy": 0.7414454817771912, + "num_tokens": 289816317.0, + "step": 11454 + }, + { + "epoch": 1.2579617834394905, + "grad_norm": 2.0755934715270996, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7198910713195801, + "num_tokens": 289844274.0, + "step": 11455 + }, + { + "epoch": 1.258071601142104, + "grad_norm": 2.2481186389923096, + 
"learning_rate": 1e-06, + "loss": 0.871, + "mean_token_accuracy": 0.7275314927101135, + "num_tokens": 289870944.0, + "step": 11456 + }, + { + "epoch": 1.2581814188447178, + "grad_norm": 2.344601631164551, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7208059430122375, + "num_tokens": 289896193.0, + "step": 11457 + }, + { + "epoch": 1.2582912365473313, + "grad_norm": 2.101637601852417, + "learning_rate": 1e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.7298864126205444, + "num_tokens": 289924964.0, + "step": 11458 + }, + { + "epoch": 1.258401054249945, + "grad_norm": 1.9476420879364014, + "learning_rate": 1e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.7348476648330688, + "num_tokens": 289956950.0, + "step": 11459 + }, + { + "epoch": 1.2585108719525588, + "grad_norm": 2.0060465335845947, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.708179771900177, + "num_tokens": 289989658.0, + "step": 11460 + }, + { + "epoch": 1.2586206896551724, + "grad_norm": 2.0753304958343506, + "learning_rate": 1e-06, + "loss": 0.8468, + "mean_token_accuracy": 0.7343699336051941, + "num_tokens": 290018271.0, + "step": 11461 + }, + { + "epoch": 1.2587305073577861, + "grad_norm": 2.350163698196411, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7069908380508423, + "num_tokens": 290041356.0, + "step": 11462 + }, + { + "epoch": 1.2588403250603997, + "grad_norm": 2.3046693801879883, + "learning_rate": 1e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.72170090675354, + "num_tokens": 290067578.0, + "step": 11463 + }, + { + "epoch": 1.2589501427630134, + "grad_norm": 1.9811289310455322, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.693120539188385, + "num_tokens": 290101049.0, + "step": 11464 + }, + { + "epoch": 1.2590599604656272, + "grad_norm": 2.632141590118408, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7379939556121826, + "num_tokens": 290119830.0, + "step": 
11465 + }, + { + "epoch": 1.2591697781682407, + "grad_norm": 2.302184581756592, + "learning_rate": 1e-06, + "loss": 0.8618, + "mean_token_accuracy": 0.7341063022613525, + "num_tokens": 290143990.0, + "step": 11466 + }, + { + "epoch": 1.2592795958708543, + "grad_norm": 2.0033063888549805, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.6977706551551819, + "num_tokens": 290173977.0, + "step": 11467 + }, + { + "epoch": 1.259389413573468, + "grad_norm": 2.292226552963257, + "learning_rate": 1e-06, + "loss": 0.8279, + "mean_token_accuracy": 0.7413312196731567, + "num_tokens": 290195412.0, + "step": 11468 + }, + { + "epoch": 1.2594992312760818, + "grad_norm": 2.1349096298217773, + "learning_rate": 1e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7178319096565247, + "num_tokens": 290220504.0, + "step": 11469 + }, + { + "epoch": 1.2596090489786953, + "grad_norm": 2.2988243103027344, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.6993458271026611, + "num_tokens": 290245556.0, + "step": 11470 + }, + { + "epoch": 1.259718866681309, + "grad_norm": 2.1869144439697266, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7215419411659241, + "num_tokens": 290273308.0, + "step": 11471 + }, + { + "epoch": 1.2598286843839226, + "grad_norm": 2.1066713333129883, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.733288049697876, + "num_tokens": 290301361.0, + "step": 11472 + }, + { + "epoch": 1.2599385020865363, + "grad_norm": 2.0911097526550293, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7113388180732727, + "num_tokens": 290328086.0, + "step": 11473 + }, + { + "epoch": 1.26004831978915, + "grad_norm": 2.021758556365967, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7123298645019531, + "num_tokens": 290357196.0, + "step": 11474 + }, + { + "epoch": 1.2601581374917636, + "grad_norm": 2.2281694412231445, + "learning_rate": 1e-06, + "loss": 0.8814, + 
"mean_token_accuracy": 0.7283689379692078, + "num_tokens": 290383403.0, + "step": 11475 + }, + { + "epoch": 1.2602679551943774, + "grad_norm": 2.2151174545288086, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7014470100402832, + "num_tokens": 290411880.0, + "step": 11476 + }, + { + "epoch": 1.260377772896991, + "grad_norm": 2.424093723297119, + "learning_rate": 1e-06, + "loss": 0.8291, + "mean_token_accuracy": 0.740936279296875, + "num_tokens": 290434019.0, + "step": 11477 + }, + { + "epoch": 1.2604875905996047, + "grad_norm": 2.111645221710205, + "learning_rate": 1e-06, + "loss": 0.8649, + "mean_token_accuracy": 0.7246890664100647, + "num_tokens": 290462222.0, + "step": 11478 + }, + { + "epoch": 1.2605974083022184, + "grad_norm": 2.556910991668701, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7101414203643799, + "num_tokens": 290484453.0, + "step": 11479 + }, + { + "epoch": 1.260707226004832, + "grad_norm": 2.377902030944824, + "learning_rate": 1e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.7327631711959839, + "num_tokens": 290506761.0, + "step": 11480 + }, + { + "epoch": 1.2608170437074455, + "grad_norm": 2.649664878845215, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7150945663452148, + "num_tokens": 290527844.0, + "step": 11481 + }, + { + "epoch": 1.2609268614100593, + "grad_norm": 2.189286947250366, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7167998552322388, + "num_tokens": 290555386.0, + "step": 11482 + }, + { + "epoch": 1.261036679112673, + "grad_norm": 2.2296926975250244, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7018554210662842, + "num_tokens": 290582196.0, + "step": 11483 + }, + { + "epoch": 1.2611464968152866, + "grad_norm": 2.1236865520477295, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7149919867515564, + "num_tokens": 290610290.0, + "step": 11484 + }, + { + "epoch": 1.2612563145179003, + 
"grad_norm": 2.234876871109009, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7054643630981445, + "num_tokens": 290636689.0, + "step": 11485 + }, + { + "epoch": 1.2613661322205139, + "grad_norm": 2.4174630641937256, + "learning_rate": 1e-06, + "loss": 0.8717, + "mean_token_accuracy": 0.7299336791038513, + "num_tokens": 290659679.0, + "step": 11486 + }, + { + "epoch": 1.2614759499231276, + "grad_norm": 2.2778897285461426, + "learning_rate": 1e-06, + "loss": 0.8063, + "mean_token_accuracy": 0.7435683608055115, + "num_tokens": 290684150.0, + "step": 11487 + }, + { + "epoch": 1.2615857676257414, + "grad_norm": 1.9819345474243164, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7185944318771362, + "num_tokens": 290716126.0, + "step": 11488 + }, + { + "epoch": 1.261695585328355, + "grad_norm": 2.3420276641845703, + "learning_rate": 1e-06, + "loss": 0.7853, + "mean_token_accuracy": 0.7472842931747437, + "num_tokens": 290735991.0, + "step": 11489 + }, + { + "epoch": 1.2618054030309687, + "grad_norm": 2.3961219787597656, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7164653539657593, + "num_tokens": 290759294.0, + "step": 11490 + }, + { + "epoch": 1.2619152207335822, + "grad_norm": 2.5620760917663574, + "learning_rate": 1e-06, + "loss": 0.8457, + "mean_token_accuracy": 0.7309946417808533, + "num_tokens": 290778516.0, + "step": 11491 + }, + { + "epoch": 1.262025038436196, + "grad_norm": 2.392019510269165, + "learning_rate": 1e-06, + "loss": 0.729, + "mean_token_accuracy": 0.7625359296798706, + "num_tokens": 290799276.0, + "step": 11492 + }, + { + "epoch": 1.2621348561388097, + "grad_norm": 2.227199077606201, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7103689908981323, + "num_tokens": 290825999.0, + "step": 11493 + }, + { + "epoch": 1.2622446738414232, + "grad_norm": 2.1980764865875244, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7039072513580322, + 
"num_tokens": 290854705.0, + "step": 11494 + }, + { + "epoch": 1.2623544915440368, + "grad_norm": 2.2944774627685547, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7133119702339172, + "num_tokens": 290881400.0, + "step": 11495 + }, + { + "epoch": 1.2624643092466505, + "grad_norm": 2.3372955322265625, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7078068256378174, + "num_tokens": 290906648.0, + "step": 11496 + }, + { + "epoch": 1.2625741269492643, + "grad_norm": 2.227487087249756, + "learning_rate": 1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.6884242296218872, + "num_tokens": 290932222.0, + "step": 11497 + }, + { + "epoch": 1.2626839446518778, + "grad_norm": 2.5281801223754883, + "learning_rate": 1e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7270640730857849, + "num_tokens": 290952936.0, + "step": 11498 + }, + { + "epoch": 1.2627937623544916, + "grad_norm": 2.325488805770874, + "learning_rate": 1e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7193043231964111, + "num_tokens": 290975672.0, + "step": 11499 + }, + { + "epoch": 1.2629035800571051, + "grad_norm": 2.349182605743408, + "learning_rate": 1e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7102062106132507, + "num_tokens": 290999028.0, + "step": 11500 + }, + { + "epoch": 1.2630133977597189, + "grad_norm": 2.339111566543579, + "learning_rate": 1e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7269098162651062, + "num_tokens": 291023715.0, + "step": 11501 + }, + { + "epoch": 1.2631232154623326, + "grad_norm": 2.3327314853668213, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7302538156509399, + "num_tokens": 291049130.0, + "step": 11502 + }, + { + "epoch": 1.2632330331649462, + "grad_norm": 2.5143632888793945, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7197368741035461, + "num_tokens": 291071385.0, + "step": 11503 + }, + { + "epoch": 1.26334285086756, + "grad_norm": 2.007944107055664, + 
"learning_rate": 1e-06, + "loss": 0.8286, + "mean_token_accuracy": 0.7373735904693604, + "num_tokens": 291100754.0, + "step": 11504 + }, + { + "epoch": 1.2634526685701735, + "grad_norm": 2.2891900539398193, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7157524824142456, + "num_tokens": 291126092.0, + "step": 11505 + }, + { + "epoch": 1.2635624862727872, + "grad_norm": 2.419388771057129, + "learning_rate": 1e-06, + "loss": 0.7504, + "mean_token_accuracy": 0.7565351128578186, + "num_tokens": 291148287.0, + "step": 11506 + }, + { + "epoch": 1.2636723039754008, + "grad_norm": 2.436622381210327, + "learning_rate": 1e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7323482036590576, + "num_tokens": 291169633.0, + "step": 11507 + }, + { + "epoch": 1.2637821216780145, + "grad_norm": 2.1666784286499023, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7109028697013855, + "num_tokens": 291196939.0, + "step": 11508 + }, + { + "epoch": 1.263891939380628, + "grad_norm": 2.4029877185821533, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7131097316741943, + "num_tokens": 291219352.0, + "step": 11509 + }, + { + "epoch": 1.2640017570832418, + "grad_norm": 2.2014808654785156, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7049043774604797, + "num_tokens": 291247496.0, + "step": 11510 + }, + { + "epoch": 1.2641115747858556, + "grad_norm": 2.822829008102417, + "learning_rate": 1e-06, + "loss": 0.8189, + "mean_token_accuracy": 0.7424822449684143, + "num_tokens": 291266043.0, + "step": 11511 + }, + { + "epoch": 1.264221392488469, + "grad_norm": 2.82607364654541, + "learning_rate": 1e-06, + "loss": 0.7893, + "mean_token_accuracy": 0.7524263858795166, + "num_tokens": 291282237.0, + "step": 11512 + }, + { + "epoch": 1.2643312101910829, + "grad_norm": 1.9452208280563354, + "learning_rate": 1e-06, + "loss": 1.0237, + "mean_token_accuracy": 0.690875768661499, + "num_tokens": 291317160.0, + "step": 
11513 + }, + { + "epoch": 1.2644410278936964, + "grad_norm": 2.3982486724853516, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7232640981674194, + "num_tokens": 291341866.0, + "step": 11514 + }, + { + "epoch": 1.2645508455963101, + "grad_norm": 2.8434059619903564, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7286951541900635, + "num_tokens": 291359397.0, + "step": 11515 + }, + { + "epoch": 1.264660663298924, + "grad_norm": 2.4274094104766846, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7090560793876648, + "num_tokens": 291383687.0, + "step": 11516 + }, + { + "epoch": 1.2647704810015374, + "grad_norm": 2.076312303543091, + "learning_rate": 1e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7331643104553223, + "num_tokens": 291410682.0, + "step": 11517 + }, + { + "epoch": 1.264880298704151, + "grad_norm": 2.388017416000366, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7008001208305359, + "num_tokens": 291433986.0, + "step": 11518 + }, + { + "epoch": 1.2649901164067647, + "grad_norm": 2.321847677230835, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7219396829605103, + "num_tokens": 291458792.0, + "step": 11519 + }, + { + "epoch": 1.2650999341093785, + "grad_norm": 2.3428092002868652, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.6952080726623535, + "num_tokens": 291484515.0, + "step": 11520 + }, + { + "epoch": 1.265209751811992, + "grad_norm": 2.4994564056396484, + "learning_rate": 1e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7340059280395508, + "num_tokens": 291505889.0, + "step": 11521 + }, + { + "epoch": 1.2653195695146058, + "grad_norm": 2.212731122970581, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7088872194290161, + "num_tokens": 291531570.0, + "step": 11522 + }, + { + "epoch": 1.2654293872172193, + "grad_norm": 1.984274983406067, + "learning_rate": 1e-06, + "loss": 0.9391, + 
"mean_token_accuracy": 0.7145571708679199, + "num_tokens": 291563795.0, + "step": 11523 + }, + { + "epoch": 1.265539204919833, + "grad_norm": 2.4224870204925537, + "learning_rate": 1e-06, + "loss": 0.8187, + "mean_token_accuracy": 0.7376993894577026, + "num_tokens": 291583742.0, + "step": 11524 + }, + { + "epoch": 1.2656490226224468, + "grad_norm": 2.2104239463806152, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7110565304756165, + "num_tokens": 291611005.0, + "step": 11525 + }, + { + "epoch": 1.2657588403250604, + "grad_norm": 2.29443621635437, + "learning_rate": 1e-06, + "loss": 0.8274, + "mean_token_accuracy": 0.7370162010192871, + "num_tokens": 291634140.0, + "step": 11526 + }, + { + "epoch": 1.2658686580276741, + "grad_norm": 2.247190475463867, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.692274808883667, + "num_tokens": 291660600.0, + "step": 11527 + }, + { + "epoch": 1.2659784757302877, + "grad_norm": 2.4716057777404785, + "learning_rate": 1e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.7375234961509705, + "num_tokens": 291681737.0, + "step": 11528 + }, + { + "epoch": 1.2660882934329014, + "grad_norm": 2.594916582107544, + "learning_rate": 1e-06, + "loss": 0.7662, + "mean_token_accuracy": 0.7522825002670288, + "num_tokens": 291700108.0, + "step": 11529 + }, + { + "epoch": 1.2661981111355152, + "grad_norm": 2.427349090576172, + "learning_rate": 1e-06, + "loss": 0.8219, + "mean_token_accuracy": 0.7427581548690796, + "num_tokens": 291723050.0, + "step": 11530 + }, + { + "epoch": 1.2663079288381287, + "grad_norm": 2.4999735355377197, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7188109159469604, + "num_tokens": 291745403.0, + "step": 11531 + }, + { + "epoch": 1.2664177465407422, + "grad_norm": 1.9937502145767212, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7123473882675171, + "num_tokens": 291775641.0, + "step": 11532 + }, + { + "epoch": 1.266527564243356, 
+ "grad_norm": 2.3330399990081787, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7236356735229492, + "num_tokens": 291800886.0, + "step": 11533 + }, + { + "epoch": 1.2666373819459698, + "grad_norm": 2.213895082473755, + "learning_rate": 1e-06, + "loss": 0.846, + "mean_token_accuracy": 0.7326885461807251, + "num_tokens": 291826185.0, + "step": 11534 + }, + { + "epoch": 1.2667471996485833, + "grad_norm": 2.1566359996795654, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7128377556800842, + "num_tokens": 291853863.0, + "step": 11535 + }, + { + "epoch": 1.266857017351197, + "grad_norm": 2.1442673206329346, + "learning_rate": 1e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.7375606894493103, + "num_tokens": 291884850.0, + "step": 11536 + }, + { + "epoch": 1.2669668350538106, + "grad_norm": 2.091158390045166, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.7008006572723389, + "num_tokens": 291914892.0, + "step": 11537 + }, + { + "epoch": 1.2670766527564243, + "grad_norm": 2.003248929977417, + "learning_rate": 1e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.7184834480285645, + "num_tokens": 291946840.0, + "step": 11538 + }, + { + "epoch": 1.267186470459038, + "grad_norm": 2.248652458190918, + "learning_rate": 1e-06, + "loss": 0.928, + "mean_token_accuracy": 0.713273286819458, + "num_tokens": 291971500.0, + "step": 11539 + }, + { + "epoch": 1.2672962881616516, + "grad_norm": 2.3903496265411377, + "learning_rate": 1e-06, + "loss": 0.8509, + "mean_token_accuracy": 0.7311415672302246, + "num_tokens": 291993450.0, + "step": 11540 + }, + { + "epoch": 1.2674061058642654, + "grad_norm": 2.3751752376556396, + "learning_rate": 1e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.7242268323898315, + "num_tokens": 292018479.0, + "step": 11541 + }, + { + "epoch": 1.267515923566879, + "grad_norm": 2.500558853149414, + "learning_rate": 1e-06, + "loss": 0.8281, + "mean_token_accuracy": 0.7363142967224121, + 
"num_tokens": 292039920.0, + "step": 11542 + }, + { + "epoch": 1.2676257412694927, + "grad_norm": 2.179424524307251, + "learning_rate": 1e-06, + "loss": 0.8736, + "mean_token_accuracy": 0.7254378199577332, + "num_tokens": 292066285.0, + "step": 11543 + }, + { + "epoch": 1.2677355589721064, + "grad_norm": 2.2591898441314697, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7260809540748596, + "num_tokens": 292091359.0, + "step": 11544 + }, + { + "epoch": 1.26784537667472, + "grad_norm": 2.1289126873016357, + "learning_rate": 1e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.7219923734664917, + "num_tokens": 292117400.0, + "step": 11545 + }, + { + "epoch": 1.2679551943773335, + "grad_norm": 2.1024973392486572, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7109472155570984, + "num_tokens": 292146874.0, + "step": 11546 + }, + { + "epoch": 1.2680650120799473, + "grad_norm": 2.13838791847229, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7123774886131287, + "num_tokens": 292175108.0, + "step": 11547 + }, + { + "epoch": 1.268174829782561, + "grad_norm": 2.0657217502593994, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7056185007095337, + "num_tokens": 292208799.0, + "step": 11548 + }, + { + "epoch": 1.2682846474851746, + "grad_norm": 2.2237625122070312, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7202155590057373, + "num_tokens": 292233823.0, + "step": 11549 + }, + { + "epoch": 1.2683944651877883, + "grad_norm": 2.3274455070495605, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7065119743347168, + "num_tokens": 292260330.0, + "step": 11550 + }, + { + "epoch": 1.2685042828904018, + "grad_norm": 2.1956558227539062, + "learning_rate": 1e-06, + "loss": 0.8583, + "mean_token_accuracy": 0.7313999533653259, + "num_tokens": 292286117.0, + "step": 11551 + }, + { + "epoch": 1.2686141005930156, + "grad_norm": 2.3755476474761963, + 
"learning_rate": 1e-06, + "loss": 0.7214, + "mean_token_accuracy": 0.7651292085647583, + "num_tokens": 292307113.0, + "step": 11552 + }, + { + "epoch": 1.2687239182956294, + "grad_norm": 2.324319839477539, + "learning_rate": 1e-06, + "loss": 0.826, + "mean_token_accuracy": 0.7394286394119263, + "num_tokens": 292329136.0, + "step": 11553 + }, + { + "epoch": 1.268833735998243, + "grad_norm": 2.2297849655151367, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7271199226379395, + "num_tokens": 292355248.0, + "step": 11554 + }, + { + "epoch": 1.2689435537008567, + "grad_norm": 2.502291202545166, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7111024260520935, + "num_tokens": 292377594.0, + "step": 11555 + }, + { + "epoch": 1.2690533714034702, + "grad_norm": 2.096656084060669, + "learning_rate": 1e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7161962985992432, + "num_tokens": 292404768.0, + "step": 11556 + }, + { + "epoch": 1.269163189106084, + "grad_norm": 2.2696595191955566, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7094229459762573, + "num_tokens": 292430725.0, + "step": 11557 + }, + { + "epoch": 1.2692730068086975, + "grad_norm": 2.4693543910980225, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7283686399459839, + "num_tokens": 292451776.0, + "step": 11558 + }, + { + "epoch": 1.2693828245113112, + "grad_norm": 2.3501038551330566, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7158524990081787, + "num_tokens": 292478324.0, + "step": 11559 + }, + { + "epoch": 1.2694926422139248, + "grad_norm": 2.239642858505249, + "learning_rate": 1e-06, + "loss": 0.8388, + "mean_token_accuracy": 0.7413347363471985, + "num_tokens": 292503506.0, + "step": 11560 + }, + { + "epoch": 1.2696024599165385, + "grad_norm": 2.3966453075408936, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7134018540382385, + "num_tokens": 292525535.0, + "step": 
11561 + }, + { + "epoch": 1.2697122776191523, + "grad_norm": 2.272183418273926, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7144487500190735, + "num_tokens": 292550661.0, + "step": 11562 + }, + { + "epoch": 1.2698220953217658, + "grad_norm": 2.252135753631592, + "learning_rate": 1e-06, + "loss": 0.8621, + "mean_token_accuracy": 0.734230637550354, + "num_tokens": 292577604.0, + "step": 11563 + }, + { + "epoch": 1.2699319130243796, + "grad_norm": 2.221418857574463, + "learning_rate": 1e-06, + "loss": 0.8736, + "mean_token_accuracy": 0.7374023199081421, + "num_tokens": 292602912.0, + "step": 11564 + }, + { + "epoch": 1.2700417307269931, + "grad_norm": 2.3681609630584717, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7293890714645386, + "num_tokens": 292624918.0, + "step": 11565 + }, + { + "epoch": 1.2701515484296069, + "grad_norm": 2.221317768096924, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7167347073554993, + "num_tokens": 292651960.0, + "step": 11566 + }, + { + "epoch": 1.2702613661322206, + "grad_norm": 2.484387159347534, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7236168384552002, + "num_tokens": 292675137.0, + "step": 11567 + }, + { + "epoch": 1.2703711838348342, + "grad_norm": 2.549283742904663, + "learning_rate": 1e-06, + "loss": 0.8464, + "mean_token_accuracy": 0.7372132539749146, + "num_tokens": 292694931.0, + "step": 11568 + }, + { + "epoch": 1.270481001537448, + "grad_norm": 2.3749523162841797, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.716826319694519, + "num_tokens": 292717276.0, + "step": 11569 + }, + { + "epoch": 1.2705908192400615, + "grad_norm": 2.0988941192626953, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.707268238067627, + "num_tokens": 292745175.0, + "step": 11570 + }, + { + "epoch": 1.2707006369426752, + "grad_norm": 2.1164627075195312, + "learning_rate": 1e-06, + "loss": 0.8357, + 
"mean_token_accuracy": 0.7367295026779175, + "num_tokens": 292771565.0, + "step": 11571 + }, + { + "epoch": 1.2708104546452887, + "grad_norm": 2.5640876293182373, + "learning_rate": 1e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7253833413124084, + "num_tokens": 292791949.0, + "step": 11572 + }, + { + "epoch": 1.2709202723479025, + "grad_norm": 2.2737865447998047, + "learning_rate": 1e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7371531128883362, + "num_tokens": 292817119.0, + "step": 11573 + }, + { + "epoch": 1.271030090050516, + "grad_norm": 2.368901014328003, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7275952100753784, + "num_tokens": 292840327.0, + "step": 11574 + }, + { + "epoch": 1.2711399077531298, + "grad_norm": 2.398031711578369, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7139806151390076, + "num_tokens": 292862659.0, + "step": 11575 + }, + { + "epoch": 1.2712497254557436, + "grad_norm": 1.9704724550247192, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.6996280550956726, + "num_tokens": 292892942.0, + "step": 11576 + }, + { + "epoch": 1.271359543158357, + "grad_norm": 2.331975221633911, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.720618486404419, + "num_tokens": 292918539.0, + "step": 11577 + }, + { + "epoch": 1.2714693608609708, + "grad_norm": 2.1045100688934326, + "learning_rate": 1e-06, + "loss": 0.8404, + "mean_token_accuracy": 0.7421786785125732, + "num_tokens": 292945372.0, + "step": 11578 + }, + { + "epoch": 1.2715791785635844, + "grad_norm": 2.3885228633880615, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7209770679473877, + "num_tokens": 292968756.0, + "step": 11579 + }, + { + "epoch": 1.2716889962661981, + "grad_norm": 2.250093460083008, + "learning_rate": 1e-06, + "loss": 0.8157, + "mean_token_accuracy": 0.7369349598884583, + "num_tokens": 292993568.0, + "step": 11580 + }, + { + "epoch": 
1.271798813968812, + "grad_norm": 2.3093671798706055, + "learning_rate": 1e-06, + "loss": 0.8273, + "mean_token_accuracy": 0.7413520812988281, + "num_tokens": 293016894.0, + "step": 11581 + }, + { + "epoch": 1.2719086316714254, + "grad_norm": 2.0979366302490234, + "learning_rate": 1e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7236082553863525, + "num_tokens": 293044386.0, + "step": 11582 + }, + { + "epoch": 1.272018449374039, + "grad_norm": 2.1870009899139404, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7279778718948364, + "num_tokens": 293072474.0, + "step": 11583 + }, + { + "epoch": 1.2721282670766527, + "grad_norm": 2.2614784240722656, + "learning_rate": 1e-06, + "loss": 0.8562, + "mean_token_accuracy": 0.7422673106193542, + "num_tokens": 293097450.0, + "step": 11584 + }, + { + "epoch": 1.2722380847792665, + "grad_norm": 2.0034804344177246, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.718254566192627, + "num_tokens": 293126403.0, + "step": 11585 + }, + { + "epoch": 1.27234790248188, + "grad_norm": 2.098318338394165, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7185980081558228, + "num_tokens": 293157298.0, + "step": 11586 + }, + { + "epoch": 1.2724577201844938, + "grad_norm": 2.256808042526245, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7136931419372559, + "num_tokens": 293185407.0, + "step": 11587 + }, + { + "epoch": 1.2725675378871073, + "grad_norm": 2.179276943206787, + "learning_rate": 1e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.6878451704978943, + "num_tokens": 293211760.0, + "step": 11588 + }, + { + "epoch": 1.272677355589721, + "grad_norm": 2.3113415241241455, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7135469913482666, + "num_tokens": 293237058.0, + "step": 11589 + }, + { + "epoch": 1.2727871732923348, + "grad_norm": 2.2286832332611084, + "learning_rate": 1e-06, + "loss": 0.8446, + "mean_token_accuracy": 
0.7354650497436523, + "num_tokens": 293264208.0, + "step": 11590 + }, + { + "epoch": 1.2728969909949484, + "grad_norm": 2.24337100982666, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.708770215511322, + "num_tokens": 293292649.0, + "step": 11591 + }, + { + "epoch": 1.273006808697562, + "grad_norm": 2.1338210105895996, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7171314358711243, + "num_tokens": 293320930.0, + "step": 11592 + }, + { + "epoch": 1.2731166264001756, + "grad_norm": 2.429417133331299, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7152294516563416, + "num_tokens": 293344506.0, + "step": 11593 + }, + { + "epoch": 1.2732264441027894, + "grad_norm": 2.637263774871826, + "learning_rate": 1e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7238731384277344, + "num_tokens": 293364938.0, + "step": 11594 + }, + { + "epoch": 1.2733362618054032, + "grad_norm": 2.3078131675720215, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7126803994178772, + "num_tokens": 293388378.0, + "step": 11595 + }, + { + "epoch": 1.2734460795080167, + "grad_norm": 2.417965888977051, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7120077610015869, + "num_tokens": 293411750.0, + "step": 11596 + }, + { + "epoch": 1.2735558972106302, + "grad_norm": 2.4533259868621826, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7120813131332397, + "num_tokens": 293434064.0, + "step": 11597 + }, + { + "epoch": 1.273665714913244, + "grad_norm": 2.0611379146575928, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7037816643714905, + "num_tokens": 293463625.0, + "step": 11598 + }, + { + "epoch": 1.2737755326158577, + "grad_norm": 3.046401023864746, + "learning_rate": 1e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7212849855422974, + "num_tokens": 293478555.0, + "step": 11599 + }, + { + "epoch": 1.2738853503184713, + "grad_norm": 
2.373481512069702, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.7190423011779785, + "num_tokens": 293502112.0, + "step": 11600 + }, + { + "epoch": 1.273995168021085, + "grad_norm": 2.3480653762817383, + "learning_rate": 1e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.7240217328071594, + "num_tokens": 293525684.0, + "step": 11601 + }, + { + "epoch": 1.2741049857236986, + "grad_norm": 2.606499671936035, + "learning_rate": 1e-06, + "loss": 0.8353, + "mean_token_accuracy": 0.7362327575683594, + "num_tokens": 293544442.0, + "step": 11602 + }, + { + "epoch": 1.2742148034263123, + "grad_norm": 2.3154919147491455, + "learning_rate": 1e-06, + "loss": 0.8589, + "mean_token_accuracy": 0.7336259484291077, + "num_tokens": 293569562.0, + "step": 11603 + }, + { + "epoch": 1.274324621128926, + "grad_norm": 2.434288740158081, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7121800184249878, + "num_tokens": 293592327.0, + "step": 11604 + }, + { + "epoch": 1.2744344388315396, + "grad_norm": 2.151444435119629, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.713049054145813, + "num_tokens": 293620582.0, + "step": 11605 + }, + { + "epoch": 1.2745442565341534, + "grad_norm": 2.744553327560425, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.7216658592224121, + "num_tokens": 293639363.0, + "step": 11606 + }, + { + "epoch": 1.274654074236767, + "grad_norm": 2.2882118225097656, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7242869734764099, + "num_tokens": 293664621.0, + "step": 11607 + }, + { + "epoch": 1.2747638919393807, + "grad_norm": 2.23874831199646, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7050660848617554, + "num_tokens": 293691152.0, + "step": 11608 + }, + { + "epoch": 1.2748737096419944, + "grad_norm": 2.202731132507324, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7212537527084351, + "num_tokens": 
293717074.0, + "step": 11609 + }, + { + "epoch": 1.274983527344608, + "grad_norm": 2.1069071292877197, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.7097001075744629, + "num_tokens": 293746544.0, + "step": 11610 + }, + { + "epoch": 1.2750933450472215, + "grad_norm": 2.2404069900512695, + "learning_rate": 1e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7292966842651367, + "num_tokens": 293771364.0, + "step": 11611 + }, + { + "epoch": 1.2752031627498353, + "grad_norm": 1.9648678302764893, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.71687912940979, + "num_tokens": 293802452.0, + "step": 11612 + }, + { + "epoch": 1.275312980452449, + "grad_norm": 2.1659586429595947, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.6934958696365356, + "num_tokens": 293828248.0, + "step": 11613 + }, + { + "epoch": 1.2754227981550625, + "grad_norm": 2.327707290649414, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7047860622406006, + "num_tokens": 293852262.0, + "step": 11614 + }, + { + "epoch": 1.2755326158576763, + "grad_norm": 2.198855400085449, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7174341678619385, + "num_tokens": 293879443.0, + "step": 11615 + }, + { + "epoch": 1.2756424335602898, + "grad_norm": 2.3580610752105713, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.707848072052002, + "num_tokens": 293903805.0, + "step": 11616 + }, + { + "epoch": 1.2757522512629036, + "grad_norm": 2.420875310897827, + "learning_rate": 1e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7248404026031494, + "num_tokens": 293924700.0, + "step": 11617 + }, + { + "epoch": 1.2758620689655173, + "grad_norm": 2.330589771270752, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7109533548355103, + "num_tokens": 293947363.0, + "step": 11618 + }, + { + "epoch": 1.2759718866681309, + "grad_norm": 2.2360963821411133, + "learning_rate": 
1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.6935099363327026, + "num_tokens": 293973671.0, + "step": 11619 + }, + { + "epoch": 1.2760817043707446, + "grad_norm": 2.764310359954834, + "learning_rate": 1e-06, + "loss": 0.7572, + "mean_token_accuracy": 0.753825306892395, + "num_tokens": 293991056.0, + "step": 11620 + }, + { + "epoch": 1.2761915220733582, + "grad_norm": 2.2028753757476807, + "learning_rate": 1e-06, + "loss": 0.7687, + "mean_token_accuracy": 0.7543171644210815, + "num_tokens": 294016600.0, + "step": 11621 + }, + { + "epoch": 1.276301339775972, + "grad_norm": 2.357858657836914, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7351393103599548, + "num_tokens": 294039797.0, + "step": 11622 + }, + { + "epoch": 1.2764111574785855, + "grad_norm": 2.1383919715881348, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7231451869010925, + "num_tokens": 294067899.0, + "step": 11623 + }, + { + "epoch": 1.2765209751811992, + "grad_norm": 2.281019449234009, + "learning_rate": 1e-06, + "loss": 0.8696, + "mean_token_accuracy": 0.7234978675842285, + "num_tokens": 294093405.0, + "step": 11624 + }, + { + "epoch": 1.2766307928838128, + "grad_norm": 2.574462413787842, + "learning_rate": 1e-06, + "loss": 0.8626, + "mean_token_accuracy": 0.731576144695282, + "num_tokens": 294114189.0, + "step": 11625 + }, + { + "epoch": 1.2767406105864265, + "grad_norm": 2.2091269493103027, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7099549174308777, + "num_tokens": 294142956.0, + "step": 11626 + }, + { + "epoch": 1.2768504282890403, + "grad_norm": 2.3331847190856934, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7030664682388306, + "num_tokens": 294165949.0, + "step": 11627 + }, + { + "epoch": 1.2769602459916538, + "grad_norm": 2.1352357864379883, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.6910731792449951, + "num_tokens": 294192781.0, + "step": 11628 + }, + { + 
"epoch": 1.2770700636942676, + "grad_norm": 2.2942538261413574, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.719339907169342, + "num_tokens": 294218577.0, + "step": 11629 + }, + { + "epoch": 1.277179881396881, + "grad_norm": 2.2949888706207275, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7152140140533447, + "num_tokens": 294243770.0, + "step": 11630 + }, + { + "epoch": 1.2772896990994949, + "grad_norm": 2.19460129737854, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7280021905899048, + "num_tokens": 294269063.0, + "step": 11631 + }, + { + "epoch": 1.2773995168021086, + "grad_norm": 2.6431374549865723, + "learning_rate": 1e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7323669791221619, + "num_tokens": 294287431.0, + "step": 11632 + }, + { + "epoch": 1.2775093345047221, + "grad_norm": 2.2218985557556152, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.6845199465751648, + "num_tokens": 294315340.0, + "step": 11633 + }, + { + "epoch": 1.2776191522073357, + "grad_norm": 2.8694512844085693, + "learning_rate": 1e-06, + "loss": 0.8074, + "mean_token_accuracy": 0.7440445423126221, + "num_tokens": 294331770.0, + "step": 11634 + }, + { + "epoch": 1.2777289699099494, + "grad_norm": 2.176907777786255, + "learning_rate": 1e-06, + "loss": 0.8704, + "mean_token_accuracy": 0.7231782674789429, + "num_tokens": 294356954.0, + "step": 11635 + }, + { + "epoch": 1.2778387876125632, + "grad_norm": 2.172231912612915, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7138508558273315, + "num_tokens": 294383666.0, + "step": 11636 + }, + { + "epoch": 1.2779486053151767, + "grad_norm": 2.3488287925720215, + "learning_rate": 1e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7203744649887085, + "num_tokens": 294406098.0, + "step": 11637 + }, + { + "epoch": 1.2780584230177905, + "grad_norm": 2.252307176589966, + "learning_rate": 1e-06, + "loss": 0.9543, + 
"mean_token_accuracy": 0.6976560354232788, + "num_tokens": 294433159.0, + "step": 11638 + }, + { + "epoch": 1.278168240720404, + "grad_norm": 2.211599111557007, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7204811573028564, + "num_tokens": 294459139.0, + "step": 11639 + }, + { + "epoch": 1.2782780584230178, + "grad_norm": 2.104627847671509, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7140787243843079, + "num_tokens": 294485159.0, + "step": 11640 + }, + { + "epoch": 1.2783878761256315, + "grad_norm": 1.9910351037979126, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7231047749519348, + "num_tokens": 294515776.0, + "step": 11641 + }, + { + "epoch": 1.278497693828245, + "grad_norm": 2.260002851486206, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7129805088043213, + "num_tokens": 294542029.0, + "step": 11642 + }, + { + "epoch": 1.2786075115308588, + "grad_norm": 2.1965742111206055, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7196559906005859, + "num_tokens": 294567545.0, + "step": 11643 + }, + { + "epoch": 1.2787173292334724, + "grad_norm": 2.307497262954712, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7056357264518738, + "num_tokens": 294592852.0, + "step": 11644 + }, + { + "epoch": 1.2788271469360861, + "grad_norm": 2.1943962574005127, + "learning_rate": 1e-06, + "loss": 0.8606, + "mean_token_accuracy": 0.7354847192764282, + "num_tokens": 294617474.0, + "step": 11645 + }, + { + "epoch": 1.2789369646386999, + "grad_norm": 2.6938273906707764, + "learning_rate": 1e-06, + "loss": 0.8321, + "mean_token_accuracy": 0.7369914054870605, + "num_tokens": 294636525.0, + "step": 11646 + }, + { + "epoch": 1.2790467823413134, + "grad_norm": 2.150242805480957, + "learning_rate": 1e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.727258563041687, + "num_tokens": 294662845.0, + "step": 11647 + }, + { + "epoch": 1.279156600043927, 
+ "grad_norm": 2.1632497310638428, + "learning_rate": 1e-06, + "loss": 0.8937, + "mean_token_accuracy": 0.7222644090652466, + "num_tokens": 294689638.0, + "step": 11648 + }, + { + "epoch": 1.2792664177465407, + "grad_norm": 2.751204013824463, + "learning_rate": 1e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7279014587402344, + "num_tokens": 294707845.0, + "step": 11649 + }, + { + "epoch": 1.2793762354491545, + "grad_norm": 2.347573757171631, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7172200083732605, + "num_tokens": 294732425.0, + "step": 11650 + }, + { + "epoch": 1.279486053151768, + "grad_norm": 2.3624744415283203, + "learning_rate": 1e-06, + "loss": 0.8647, + "mean_token_accuracy": 0.7253286838531494, + "num_tokens": 294755500.0, + "step": 11651 + }, + { + "epoch": 1.2795958708543818, + "grad_norm": 2.3620519638061523, + "learning_rate": 1e-06, + "loss": 0.8523, + "mean_token_accuracy": 0.7292436361312866, + "num_tokens": 294782045.0, + "step": 11652 + }, + { + "epoch": 1.2797056885569953, + "grad_norm": 2.536864995956421, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7126133441925049, + "num_tokens": 294804631.0, + "step": 11653 + }, + { + "epoch": 1.279815506259609, + "grad_norm": 2.2263360023498535, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.722245991230011, + "num_tokens": 294828472.0, + "step": 11654 + }, + { + "epoch": 1.2799253239622228, + "grad_norm": 2.2569520473480225, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.6998628377914429, + "num_tokens": 294855765.0, + "step": 11655 + }, + { + "epoch": 1.2800351416648363, + "grad_norm": 2.062822103500366, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.7012560367584229, + "num_tokens": 294884338.0, + "step": 11656 + }, + { + "epoch": 1.28014495936745, + "grad_norm": 2.5791263580322266, + "learning_rate": 1e-06, + "loss": 0.8048, + "mean_token_accuracy": 0.7447766661643982, + 
"num_tokens": 294903947.0, + "step": 11657 + }, + { + "epoch": 1.2802547770700636, + "grad_norm": 2.1379683017730713, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7087662220001221, + "num_tokens": 294932287.0, + "step": 11658 + }, + { + "epoch": 1.2803645947726774, + "grad_norm": 2.2233569622039795, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7315328121185303, + "num_tokens": 294958870.0, + "step": 11659 + }, + { + "epoch": 1.2804744124752911, + "grad_norm": 2.4716784954071045, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7224186658859253, + "num_tokens": 294979132.0, + "step": 11660 + }, + { + "epoch": 1.2805842301779047, + "grad_norm": 2.4781200885772705, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7221173048019409, + "num_tokens": 295002086.0, + "step": 11661 + }, + { + "epoch": 1.2806940478805182, + "grad_norm": 2.484614610671997, + "learning_rate": 1e-06, + "loss": 0.8445, + "mean_token_accuracy": 0.7409306764602661, + "num_tokens": 295022508.0, + "step": 11662 + }, + { + "epoch": 1.280803865583132, + "grad_norm": 2.255293846130371, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7023553252220154, + "num_tokens": 295049021.0, + "step": 11663 + }, + { + "epoch": 1.2809136832857457, + "grad_norm": 2.1283230781555176, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.6973389983177185, + "num_tokens": 295077601.0, + "step": 11664 + }, + { + "epoch": 1.2810235009883593, + "grad_norm": 2.4000675678253174, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.6915180087089539, + "num_tokens": 295103023.0, + "step": 11665 + }, + { + "epoch": 1.281133318690973, + "grad_norm": 2.021610736846924, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7239758968353271, + "num_tokens": 295133277.0, + "step": 11666 + }, + { + "epoch": 1.2812431363935866, + "grad_norm": 2.187835693359375, + 
"learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7198901176452637, + "num_tokens": 295160579.0, + "step": 11667 + }, + { + "epoch": 1.2813529540962003, + "grad_norm": 2.06949782371521, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7041245102882385, + "num_tokens": 295190968.0, + "step": 11668 + }, + { + "epoch": 1.281462771798814, + "grad_norm": 2.238887071609497, + "learning_rate": 1e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.7219104170799255, + "num_tokens": 295216955.0, + "step": 11669 + }, + { + "epoch": 1.2815725895014276, + "grad_norm": 2.4325668811798096, + "learning_rate": 1e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.738224446773529, + "num_tokens": 295240318.0, + "step": 11670 + }, + { + "epoch": 1.2816824072040414, + "grad_norm": 2.3019559383392334, + "learning_rate": 1e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7248765230178833, + "num_tokens": 295265900.0, + "step": 11671 + }, + { + "epoch": 1.281792224906655, + "grad_norm": 2.143493175506592, + "learning_rate": 1e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7211183309555054, + "num_tokens": 295292708.0, + "step": 11672 + }, + { + "epoch": 1.2819020426092687, + "grad_norm": 2.6618030071258545, + "learning_rate": 1e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.741075873374939, + "num_tokens": 295311753.0, + "step": 11673 + }, + { + "epoch": 1.2820118603118824, + "grad_norm": 2.887650489807129, + "learning_rate": 1e-06, + "loss": 0.7885, + "mean_token_accuracy": 0.75375896692276, + "num_tokens": 295328771.0, + "step": 11674 + }, + { + "epoch": 1.282121678014496, + "grad_norm": 2.4346842765808105, + "learning_rate": 1e-06, + "loss": 0.8347, + "mean_token_accuracy": 0.741575300693512, + "num_tokens": 295350265.0, + "step": 11675 + }, + { + "epoch": 1.2822314957171095, + "grad_norm": 2.2846410274505615, + "learning_rate": 1e-06, + "loss": 0.7517, + "mean_token_accuracy": 0.7618560194969177, + "num_tokens": 295373716.0, + "step": 11676 
+ }, + { + "epoch": 1.2823413134197232, + "grad_norm": 2.328680992126465, + "learning_rate": 1e-06, + "loss": 0.8297, + "mean_token_accuracy": 0.7448347806930542, + "num_tokens": 295397153.0, + "step": 11677 + }, + { + "epoch": 1.282451131122337, + "grad_norm": 2.2715766429901123, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7030858993530273, + "num_tokens": 295426596.0, + "step": 11678 + }, + { + "epoch": 1.2825609488249505, + "grad_norm": 2.3049776554107666, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7191525101661682, + "num_tokens": 295451357.0, + "step": 11679 + }, + { + "epoch": 1.2826707665275643, + "grad_norm": 2.1296727657318115, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7182883024215698, + "num_tokens": 295479292.0, + "step": 11680 + }, + { + "epoch": 1.2827805842301778, + "grad_norm": 2.5345802307128906, + "learning_rate": 1e-06, + "loss": 0.8418, + "mean_token_accuracy": 0.7350694537162781, + "num_tokens": 295500789.0, + "step": 11681 + }, + { + "epoch": 1.2828904019327916, + "grad_norm": 2.1709768772125244, + "learning_rate": 1e-06, + "loss": 0.8845, + "mean_token_accuracy": 0.7204978466033936, + "num_tokens": 295529169.0, + "step": 11682 + }, + { + "epoch": 1.2830002196354053, + "grad_norm": 2.0761404037475586, + "learning_rate": 1e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.7042373418807983, + "num_tokens": 295558145.0, + "step": 11683 + }, + { + "epoch": 1.2831100373380189, + "grad_norm": 2.459925651550293, + "learning_rate": 1e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.7268344163894653, + "num_tokens": 295581078.0, + "step": 11684 + }, + { + "epoch": 1.2832198550406326, + "grad_norm": 2.4972031116485596, + "learning_rate": 1e-06, + "loss": 0.8458, + "mean_token_accuracy": 0.7459227442741394, + "num_tokens": 295601404.0, + "step": 11685 + }, + { + "epoch": 1.2833296727432462, + "grad_norm": 2.394501209259033, + "learning_rate": 1e-06, + "loss": 0.9144, + 
"mean_token_accuracy": 0.7146185040473938, + "num_tokens": 295626585.0, + "step": 11686 + }, + { + "epoch": 1.28343949044586, + "grad_norm": 2.402766704559326, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7105883359909058, + "num_tokens": 295650135.0, + "step": 11687 + }, + { + "epoch": 1.2835493081484735, + "grad_norm": 2.4326717853546143, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7156534194946289, + "num_tokens": 295673319.0, + "step": 11688 + }, + { + "epoch": 1.2836591258510872, + "grad_norm": 2.1678731441497803, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7095052599906921, + "num_tokens": 295699204.0, + "step": 11689 + }, + { + "epoch": 1.2837689435537007, + "grad_norm": 2.422699213027954, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7107580900192261, + "num_tokens": 295720885.0, + "step": 11690 + }, + { + "epoch": 1.2838787612563145, + "grad_norm": 2.1078011989593506, + "learning_rate": 1e-06, + "loss": 0.8336, + "mean_token_accuracy": 0.7448521852493286, + "num_tokens": 295748656.0, + "step": 11691 + }, + { + "epoch": 1.2839885789589283, + "grad_norm": 2.614312171936035, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7336632013320923, + "num_tokens": 295768951.0, + "step": 11692 + }, + { + "epoch": 1.2840983966615418, + "grad_norm": 1.996610403060913, + "learning_rate": 1e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7297980785369873, + "num_tokens": 295796694.0, + "step": 11693 + }, + { + "epoch": 1.2842082143641556, + "grad_norm": 2.306204080581665, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.711530327796936, + "num_tokens": 295821712.0, + "step": 11694 + }, + { + "epoch": 1.284318032066769, + "grad_norm": 2.6351094245910645, + "learning_rate": 1e-06, + "loss": 0.8062, + "mean_token_accuracy": 0.7387259602546692, + "num_tokens": 295839731.0, + "step": 11695 + }, + { + "epoch": 1.2844278497693828, 
+ "grad_norm": 2.3499398231506348, + "learning_rate": 1e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.7292245030403137, + "num_tokens": 295865006.0, + "step": 11696 + }, + { + "epoch": 1.2845376674719966, + "grad_norm": 2.144134759902954, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.715406060218811, + "num_tokens": 295893189.0, + "step": 11697 + }, + { + "epoch": 1.2846474851746101, + "grad_norm": 2.459354877471924, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7182384729385376, + "num_tokens": 295916485.0, + "step": 11698 + }, + { + "epoch": 1.2847573028772237, + "grad_norm": 2.2645647525787354, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7116084098815918, + "num_tokens": 295940988.0, + "step": 11699 + }, + { + "epoch": 1.2848671205798374, + "grad_norm": 2.1899075508117676, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7042074203491211, + "num_tokens": 295972002.0, + "step": 11700 + }, + { + "epoch": 1.2849769382824512, + "grad_norm": 2.3965792655944824, + "learning_rate": 1e-06, + "loss": 0.8565, + "mean_token_accuracy": 0.7266139984130859, + "num_tokens": 295996176.0, + "step": 11701 + }, + { + "epoch": 1.2850867559850647, + "grad_norm": 2.22857666015625, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7160056829452515, + "num_tokens": 296023239.0, + "step": 11702 + }, + { + "epoch": 1.2851965736876785, + "grad_norm": 2.0717554092407227, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7285051345825195, + "num_tokens": 296053536.0, + "step": 11703 + }, + { + "epoch": 1.285306391390292, + "grad_norm": 2.0552515983581543, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7054163217544556, + "num_tokens": 296082403.0, + "step": 11704 + }, + { + "epoch": 1.2854162090929058, + "grad_norm": 2.2851500511169434, + "learning_rate": 1e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.7361931800842285, 
+ "num_tokens": 296104771.0, + "step": 11705 + }, + { + "epoch": 1.2855260267955195, + "grad_norm": 1.880010962486267, + "learning_rate": 1e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7311672568321228, + "num_tokens": 296137568.0, + "step": 11706 + }, + { + "epoch": 1.285635844498133, + "grad_norm": 2.5066978931427, + "learning_rate": 1e-06, + "loss": 0.8495, + "mean_token_accuracy": 0.7331445813179016, + "num_tokens": 296158484.0, + "step": 11707 + }, + { + "epoch": 1.2857456622007468, + "grad_norm": 2.238487720489502, + "learning_rate": 1e-06, + "loss": 0.8649, + "mean_token_accuracy": 0.7357075214385986, + "num_tokens": 296184523.0, + "step": 11708 + }, + { + "epoch": 1.2858554799033604, + "grad_norm": 2.110651969909668, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7258906960487366, + "num_tokens": 296211507.0, + "step": 11709 + }, + { + "epoch": 1.2859652976059741, + "grad_norm": 1.9777599573135376, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7252213954925537, + "num_tokens": 296246323.0, + "step": 11710 + }, + { + "epoch": 1.2860751153085879, + "grad_norm": 2.31669020652771, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7197233438491821, + "num_tokens": 296272150.0, + "step": 11711 + }, + { + "epoch": 1.2861849330112014, + "grad_norm": 2.343923807144165, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.714725136756897, + "num_tokens": 296296766.0, + "step": 11712 + }, + { + "epoch": 1.286294750713815, + "grad_norm": 2.370882749557495, + "learning_rate": 1e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.7141850590705872, + "num_tokens": 296320547.0, + "step": 11713 + }, + { + "epoch": 1.2864045684164287, + "grad_norm": 2.249194860458374, + "learning_rate": 1e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.7311630845069885, + "num_tokens": 296345700.0, + "step": 11714 + }, + { + "epoch": 1.2865143861190425, + "grad_norm": 2.114985227584839, + 
"learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7026496529579163, + "num_tokens": 296374533.0, + "step": 11715 + }, + { + "epoch": 1.286624203821656, + "grad_norm": 2.506059169769287, + "learning_rate": 1e-06, + "loss": 0.7903, + "mean_token_accuracy": 0.7464492321014404, + "num_tokens": 296395475.0, + "step": 11716 + }, + { + "epoch": 1.2867340215242697, + "grad_norm": 2.3781375885009766, + "learning_rate": 1e-06, + "loss": 0.8543, + "mean_token_accuracy": 0.7298139929771423, + "num_tokens": 296416754.0, + "step": 11717 + }, + { + "epoch": 1.2868438392268833, + "grad_norm": 2.2703921794891357, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7023404240608215, + "num_tokens": 296442149.0, + "step": 11718 + }, + { + "epoch": 1.286953656929497, + "grad_norm": 2.0445427894592285, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7073317170143127, + "num_tokens": 296471585.0, + "step": 11719 + }, + { + "epoch": 1.2870634746321108, + "grad_norm": 2.333679437637329, + "learning_rate": 1e-06, + "loss": 0.8171, + "mean_token_accuracy": 0.7407358884811401, + "num_tokens": 296495051.0, + "step": 11720 + }, + { + "epoch": 1.2871732923347243, + "grad_norm": 2.075995683670044, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7176913619041443, + "num_tokens": 296524539.0, + "step": 11721 + }, + { + "epoch": 1.287283110037338, + "grad_norm": 2.3501501083374023, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7138714790344238, + "num_tokens": 296548721.0, + "step": 11722 + }, + { + "epoch": 1.2873929277399516, + "grad_norm": 2.2170827388763428, + "learning_rate": 1e-06, + "loss": 0.8487, + "mean_token_accuracy": 0.744645893573761, + "num_tokens": 296575126.0, + "step": 11723 + }, + { + "epoch": 1.2875027454425654, + "grad_norm": 2.359666347503662, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.709578275680542, + "num_tokens": 296598190.0, + "step": 
11724 + }, + { + "epoch": 1.2876125631451791, + "grad_norm": 2.0476067066192627, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7073094844818115, + "num_tokens": 296629082.0, + "step": 11725 + }, + { + "epoch": 1.2877223808477927, + "grad_norm": 2.242558717727661, + "learning_rate": 1e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7227828502655029, + "num_tokens": 296655899.0, + "step": 11726 + }, + { + "epoch": 1.2878321985504062, + "grad_norm": 2.4924652576446533, + "learning_rate": 1e-06, + "loss": 0.8511, + "mean_token_accuracy": 0.7340878248214722, + "num_tokens": 296676822.0, + "step": 11727 + }, + { + "epoch": 1.28794201625302, + "grad_norm": 2.2304437160491943, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.6989719867706299, + "num_tokens": 296705440.0, + "step": 11728 + }, + { + "epoch": 1.2880518339556337, + "grad_norm": 2.3060855865478516, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.71830153465271, + "num_tokens": 296731165.0, + "step": 11729 + }, + { + "epoch": 1.2881616516582473, + "grad_norm": 2.5350351333618164, + "learning_rate": 1e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.7237027883529663, + "num_tokens": 296752357.0, + "step": 11730 + }, + { + "epoch": 1.288271469360861, + "grad_norm": 2.368037223815918, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7081736326217651, + "num_tokens": 296775690.0, + "step": 11731 + }, + { + "epoch": 1.2883812870634745, + "grad_norm": 2.122758388519287, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7094432711601257, + "num_tokens": 296804148.0, + "step": 11732 + }, + { + "epoch": 1.2884911047660883, + "grad_norm": 2.25004243850708, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.6983702182769775, + "num_tokens": 296830969.0, + "step": 11733 + }, + { + "epoch": 1.288600922468702, + "grad_norm": 2.3454837799072266, + "learning_rate": 1e-06, + "loss": 0.9264, + 
"mean_token_accuracy": 0.720937967300415, + "num_tokens": 296855289.0, + "step": 11734 + }, + { + "epoch": 1.2887107401713156, + "grad_norm": 2.3591666221618652, + "learning_rate": 1e-06, + "loss": 0.8543, + "mean_token_accuracy": 0.7343928217887878, + "num_tokens": 296879206.0, + "step": 11735 + }, + { + "epoch": 1.2888205578739294, + "grad_norm": 1.9048945903778076, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.6953798532485962, + "num_tokens": 296915113.0, + "step": 11736 + }, + { + "epoch": 1.2889303755765429, + "grad_norm": 1.9421979188919067, + "learning_rate": 1e-06, + "loss": 0.8512, + "mean_token_accuracy": 0.737186074256897, + "num_tokens": 296945247.0, + "step": 11737 + }, + { + "epoch": 1.2890401932791566, + "grad_norm": 2.4882652759552, + "learning_rate": 1e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.7403957843780518, + "num_tokens": 296966009.0, + "step": 11738 + }, + { + "epoch": 1.2891500109817704, + "grad_norm": 2.217979907989502, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7318174839019775, + "num_tokens": 296992563.0, + "step": 11739 + }, + { + "epoch": 1.289259828684384, + "grad_norm": 2.1910855770111084, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.695875883102417, + "num_tokens": 297020495.0, + "step": 11740 + }, + { + "epoch": 1.2893696463869975, + "grad_norm": 2.0735437870025635, + "learning_rate": 1e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7254537343978882, + "num_tokens": 297052096.0, + "step": 11741 + }, + { + "epoch": 1.2894794640896112, + "grad_norm": 2.266627788543701, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.7037287354469299, + "num_tokens": 297078680.0, + "step": 11742 + }, + { + "epoch": 1.289589281792225, + "grad_norm": 2.020841598510742, + "learning_rate": 1e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7251458764076233, + "num_tokens": 297109344.0, + "step": 11743 + }, + { + "epoch": 1.2896990994948385, + 
"grad_norm": 2.4366910457611084, + "learning_rate": 1e-06, + "loss": 0.7677, + "mean_token_accuracy": 0.7531164884567261, + "num_tokens": 297132820.0, + "step": 11744 + }, + { + "epoch": 1.2898089171974523, + "grad_norm": 2.2506635189056396, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7309859991073608, + "num_tokens": 297158604.0, + "step": 11745 + }, + { + "epoch": 1.2899187349000658, + "grad_norm": 2.373478651046753, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7090367078781128, + "num_tokens": 297184398.0, + "step": 11746 + }, + { + "epoch": 1.2900285526026796, + "grad_norm": 2.392554521560669, + "learning_rate": 1e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7270311713218689, + "num_tokens": 297207405.0, + "step": 11747 + }, + { + "epoch": 1.2901383703052933, + "grad_norm": 2.0941412448883057, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7128897905349731, + "num_tokens": 297238851.0, + "step": 11748 + }, + { + "epoch": 1.2902481880079069, + "grad_norm": 2.40006685256958, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7139400243759155, + "num_tokens": 297261032.0, + "step": 11749 + }, + { + "epoch": 1.2903580057105206, + "grad_norm": 2.174409866333008, + "learning_rate": 1e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7363801002502441, + "num_tokens": 297287371.0, + "step": 11750 + }, + { + "epoch": 1.2904678234131342, + "grad_norm": 2.2930777072906494, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7088708877563477, + "num_tokens": 297314103.0, + "step": 11751 + }, + { + "epoch": 1.290577641115748, + "grad_norm": 2.1292777061462402, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7179331183433533, + "num_tokens": 297340627.0, + "step": 11752 + }, + { + "epoch": 1.2906874588183614, + "grad_norm": 2.348292350769043, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7103140354156494, + 
"num_tokens": 297363988.0, + "step": 11753 + }, + { + "epoch": 1.2907972765209752, + "grad_norm": 2.029642105102539, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7014485001564026, + "num_tokens": 297393282.0, + "step": 11754 + }, + { + "epoch": 1.2909070942235887, + "grad_norm": 2.6383204460144043, + "learning_rate": 1e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.7374261617660522, + "num_tokens": 297412287.0, + "step": 11755 + }, + { + "epoch": 1.2910169119262025, + "grad_norm": 2.6100964546203613, + "learning_rate": 1e-06, + "loss": 0.8434, + "mean_token_accuracy": 0.7311902642250061, + "num_tokens": 297433576.0, + "step": 11756 + }, + { + "epoch": 1.2911267296288162, + "grad_norm": 2.2885591983795166, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.713135302066803, + "num_tokens": 297460249.0, + "step": 11757 + }, + { + "epoch": 1.2912365473314298, + "grad_norm": 2.423762321472168, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7003676891326904, + "num_tokens": 297482235.0, + "step": 11758 + }, + { + "epoch": 1.2913463650340435, + "grad_norm": 2.488718271255493, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7236977815628052, + "num_tokens": 297502591.0, + "step": 11759 + }, + { + "epoch": 1.291456182736657, + "grad_norm": 2.250333309173584, + "learning_rate": 1e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7215524911880493, + "num_tokens": 297527154.0, + "step": 11760 + }, + { + "epoch": 1.2915660004392708, + "grad_norm": 2.2322769165039062, + "learning_rate": 1e-06, + "loss": 0.7988, + "mean_token_accuracy": 0.7451176643371582, + "num_tokens": 297552106.0, + "step": 11761 + }, + { + "epoch": 1.2916758181418846, + "grad_norm": 2.591485023498535, + "learning_rate": 1e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.7401203513145447, + "num_tokens": 297572760.0, + "step": 11762 + }, + { + "epoch": 1.2917856358444981, + "grad_norm": 2.456408739089966, + 
"learning_rate": 1e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.7276504635810852, + "num_tokens": 297593813.0, + "step": 11763 + }, + { + "epoch": 1.2918954535471117, + "grad_norm": 2.2114956378936768, + "learning_rate": 1e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.7263418436050415, + "num_tokens": 297620118.0, + "step": 11764 + }, + { + "epoch": 1.2920052712497254, + "grad_norm": 2.2468008995056152, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7088981866836548, + "num_tokens": 297646719.0, + "step": 11765 + }, + { + "epoch": 1.2921150889523392, + "grad_norm": 2.307629108428955, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.6960359811782837, + "num_tokens": 297672893.0, + "step": 11766 + }, + { + "epoch": 1.2922249066549527, + "grad_norm": 2.034649133682251, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.6957831382751465, + "num_tokens": 297705020.0, + "step": 11767 + }, + { + "epoch": 1.2923347243575665, + "grad_norm": 2.5788185596466064, + "learning_rate": 1e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.732613205909729, + "num_tokens": 297726112.0, + "step": 11768 + }, + { + "epoch": 1.29244454206018, + "grad_norm": 2.4936373233795166, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7091814279556274, + "num_tokens": 297749062.0, + "step": 11769 + }, + { + "epoch": 1.2925543597627938, + "grad_norm": 2.5030410289764404, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7307244539260864, + "num_tokens": 297770330.0, + "step": 11770 + }, + { + "epoch": 1.2926641774654075, + "grad_norm": 2.1135683059692383, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7212817072868347, + "num_tokens": 297799655.0, + "step": 11771 + }, + { + "epoch": 1.292773995168021, + "grad_norm": 2.1753766536712646, + "learning_rate": 1e-06, + "loss": 0.8302, + "mean_token_accuracy": 0.7328580617904663, + "num_tokens": 297825474.0, + 
"step": 11772 + }, + { + "epoch": 1.2928838128706348, + "grad_norm": 2.3789217472076416, + "learning_rate": 1e-06, + "loss": 0.8402, + "mean_token_accuracy": 0.7386199235916138, + "num_tokens": 297848574.0, + "step": 11773 + }, + { + "epoch": 1.2929936305732483, + "grad_norm": 2.286900281906128, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7008295655250549, + "num_tokens": 297873755.0, + "step": 11774 + }, + { + "epoch": 1.293103448275862, + "grad_norm": 2.4873814582824707, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7052245736122131, + "num_tokens": 297897456.0, + "step": 11775 + }, + { + "epoch": 1.2932132659784759, + "grad_norm": 2.242417573928833, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7178992033004761, + "num_tokens": 297923501.0, + "step": 11776 + }, + { + "epoch": 1.2933230836810894, + "grad_norm": 2.421096086502075, + "learning_rate": 1e-06, + "loss": 0.8077, + "mean_token_accuracy": 0.7414230108261108, + "num_tokens": 297944403.0, + "step": 11777 + }, + { + "epoch": 1.293432901383703, + "grad_norm": 1.9586833715438843, + "learning_rate": 1e-06, + "loss": 0.8595, + "mean_token_accuracy": 0.7319791316986084, + "num_tokens": 297976154.0, + "step": 11778 + }, + { + "epoch": 1.2935427190863167, + "grad_norm": 2.049286127090454, + "learning_rate": 1e-06, + "loss": 0.7998, + "mean_token_accuracy": 0.7464954853057861, + "num_tokens": 298003476.0, + "step": 11779 + }, + { + "epoch": 1.2936525367889304, + "grad_norm": 2.3070874214172363, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7246689796447754, + "num_tokens": 298025314.0, + "step": 11780 + }, + { + "epoch": 1.293762354491544, + "grad_norm": 2.171888589859009, + "learning_rate": 1e-06, + "loss": 0.8578, + "mean_token_accuracy": 0.734380304813385, + "num_tokens": 298049850.0, + "step": 11781 + }, + { + "epoch": 1.2938721721941577, + "grad_norm": 2.428818702697754, + "learning_rate": 1e-06, + "loss": 
0.9575, + "mean_token_accuracy": 0.7069346904754639, + "num_tokens": 298071827.0, + "step": 11782 + }, + { + "epoch": 1.2939819898967713, + "grad_norm": 2.5036373138427734, + "learning_rate": 1e-06, + "loss": 0.7643, + "mean_token_accuracy": 0.7561962604522705, + "num_tokens": 298091047.0, + "step": 11783 + }, + { + "epoch": 1.294091807599385, + "grad_norm": 2.2738921642303467, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7175525426864624, + "num_tokens": 298116580.0, + "step": 11784 + }, + { + "epoch": 1.2942016253019988, + "grad_norm": 2.38657808303833, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7020858526229858, + "num_tokens": 298140205.0, + "step": 11785 + }, + { + "epoch": 1.2943114430046123, + "grad_norm": 2.3160383701324463, + "learning_rate": 1e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.7405589818954468, + "num_tokens": 298164249.0, + "step": 11786 + }, + { + "epoch": 1.294421260707226, + "grad_norm": 2.0336756706237793, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7116818428039551, + "num_tokens": 298193630.0, + "step": 11787 + }, + { + "epoch": 1.2945310784098396, + "grad_norm": 2.7531113624572754, + "learning_rate": 1e-06, + "loss": 0.8267, + "mean_token_accuracy": 0.7371251583099365, + "num_tokens": 298212227.0, + "step": 11788 + }, + { + "epoch": 1.2946408961124534, + "grad_norm": 2.433387517929077, + "learning_rate": 1e-06, + "loss": 0.8652, + "mean_token_accuracy": 0.741438627243042, + "num_tokens": 298233928.0, + "step": 11789 + }, + { + "epoch": 1.2947507138150671, + "grad_norm": 2.3377840518951416, + "learning_rate": 1e-06, + "loss": 0.7553, + "mean_token_accuracy": 0.7627846002578735, + "num_tokens": 298256773.0, + "step": 11790 + }, + { + "epoch": 1.2948605315176807, + "grad_norm": 2.2391722202301025, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7296959161758423, + "num_tokens": 298284083.0, + "step": 11791 + }, + { + "epoch": 
1.2949703492202942, + "grad_norm": 2.436075210571289, + "learning_rate": 1e-06, + "loss": 0.8015, + "mean_token_accuracy": 0.7473173141479492, + "num_tokens": 298305468.0, + "step": 11792 + }, + { + "epoch": 1.295080166922908, + "grad_norm": 2.13323712348938, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.6917340755462646, + "num_tokens": 298333683.0, + "step": 11793 + }, + { + "epoch": 1.2951899846255217, + "grad_norm": 2.392719030380249, + "learning_rate": 1e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.7480514049530029, + "num_tokens": 298354651.0, + "step": 11794 + }, + { + "epoch": 1.2952998023281352, + "grad_norm": 2.127842903137207, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7117159366607666, + "num_tokens": 298382195.0, + "step": 11795 + }, + { + "epoch": 1.295409620030749, + "grad_norm": 2.340883255004883, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7053676247596741, + "num_tokens": 298406088.0, + "step": 11796 + }, + { + "epoch": 1.2955194377333625, + "grad_norm": 2.3950698375701904, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.7215408086776733, + "num_tokens": 298428301.0, + "step": 11797 + }, + { + "epoch": 1.2956292554359763, + "grad_norm": 2.2578647136688232, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7176921963691711, + "num_tokens": 298451985.0, + "step": 11798 + }, + { + "epoch": 1.29573907313859, + "grad_norm": 2.3583455085754395, + "learning_rate": 1e-06, + "loss": 0.8318, + "mean_token_accuracy": 0.7417591214179993, + "num_tokens": 298473521.0, + "step": 11799 + }, + { + "epoch": 1.2958488908412036, + "grad_norm": 2.4422197341918945, + "learning_rate": 1e-06, + "loss": 0.7984, + "mean_token_accuracy": 0.7407577037811279, + "num_tokens": 298494169.0, + "step": 11800 + }, + { + "epoch": 1.2959587085438173, + "grad_norm": 2.6130523681640625, + "learning_rate": 1e-06, + "loss": 0.8547, + "mean_token_accuracy": 
0.7339056730270386, + "num_tokens": 298513113.0, + "step": 11801 + }, + { + "epoch": 1.2960685262464309, + "grad_norm": 2.202758550643921, + "learning_rate": 1e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7111506462097168, + "num_tokens": 298540191.0, + "step": 11802 + }, + { + "epoch": 1.2961783439490446, + "grad_norm": 2.341475486755371, + "learning_rate": 1e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.6913864612579346, + "num_tokens": 298563491.0, + "step": 11803 + }, + { + "epoch": 1.2962881616516582, + "grad_norm": 2.9259138107299805, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7245852947235107, + "num_tokens": 298580149.0, + "step": 11804 + }, + { + "epoch": 1.296397979354272, + "grad_norm": 2.55440092086792, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.727998673915863, + "num_tokens": 298601937.0, + "step": 11805 + }, + { + "epoch": 1.2965077970568855, + "grad_norm": 2.1842994689941406, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.710662841796875, + "num_tokens": 298628075.0, + "step": 11806 + }, + { + "epoch": 1.2966176147594992, + "grad_norm": 1.9778087139129639, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7120540142059326, + "num_tokens": 298658767.0, + "step": 11807 + }, + { + "epoch": 1.296727432462113, + "grad_norm": 2.019052743911743, + "learning_rate": 1e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7151223421096802, + "num_tokens": 298690819.0, + "step": 11808 + }, + { + "epoch": 1.2968372501647265, + "grad_norm": 2.317249059677124, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7227903604507446, + "num_tokens": 298714110.0, + "step": 11809 + }, + { + "epoch": 1.2969470678673403, + "grad_norm": 2.4893579483032227, + "learning_rate": 1e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.7304524183273315, + "num_tokens": 298733751.0, + "step": 11810 + }, + { + "epoch": 1.2970568855699538, + "grad_norm": 
2.2644004821777344, + "learning_rate": 1e-06, + "loss": 0.8054, + "mean_token_accuracy": 0.7431533932685852, + "num_tokens": 298756601.0, + "step": 11811 + }, + { + "epoch": 1.2971667032725676, + "grad_norm": 2.408092737197876, + "learning_rate": 1e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.7297841310501099, + "num_tokens": 298777921.0, + "step": 11812 + }, + { + "epoch": 1.2972765209751813, + "grad_norm": 2.2141058444976807, + "learning_rate": 1e-06, + "loss": 0.891, + "mean_token_accuracy": 0.7223982810974121, + "num_tokens": 298803131.0, + "step": 11813 + }, + { + "epoch": 1.2973863386777948, + "grad_norm": 2.4327149391174316, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.701927125453949, + "num_tokens": 298827535.0, + "step": 11814 + }, + { + "epoch": 1.2974961563804086, + "grad_norm": 2.185375690460205, + "learning_rate": 1e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7357998490333557, + "num_tokens": 298853213.0, + "step": 11815 + }, + { + "epoch": 1.2976059740830221, + "grad_norm": 2.2692131996154785, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7073121070861816, + "num_tokens": 298879419.0, + "step": 11816 + }, + { + "epoch": 1.297715791785636, + "grad_norm": 2.4159727096557617, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7154653668403625, + "num_tokens": 298902165.0, + "step": 11817 + }, + { + "epoch": 1.2978256094882494, + "grad_norm": 2.432051181793213, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7137389183044434, + "num_tokens": 298923790.0, + "step": 11818 + }, + { + "epoch": 1.2979354271908632, + "grad_norm": 2.21942400932312, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7096095085144043, + "num_tokens": 298950921.0, + "step": 11819 + }, + { + "epoch": 1.2980452448934767, + "grad_norm": 2.1272201538085938, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7127975821495056, + "num_tokens": 
298980888.0, + "step": 11820 + }, + { + "epoch": 1.2981550625960905, + "grad_norm": 2.013986825942993, + "learning_rate": 1e-06, + "loss": 0.8254, + "mean_token_accuracy": 0.7464901208877563, + "num_tokens": 299009791.0, + "step": 11821 + }, + { + "epoch": 1.2982648802987042, + "grad_norm": 2.2211873531341553, + "learning_rate": 1e-06, + "loss": 0.7719, + "mean_token_accuracy": 0.7494255900382996, + "num_tokens": 299033517.0, + "step": 11822 + }, + { + "epoch": 1.2983746980013178, + "grad_norm": 2.4866549968719482, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7249863147735596, + "num_tokens": 299057453.0, + "step": 11823 + }, + { + "epoch": 1.2984845157039315, + "grad_norm": 2.2884275913238525, + "learning_rate": 1e-06, + "loss": 0.8348, + "mean_token_accuracy": 0.7392697930335999, + "num_tokens": 299080795.0, + "step": 11824 + }, + { + "epoch": 1.298594333406545, + "grad_norm": 2.2670493125915527, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7090054750442505, + "num_tokens": 299106726.0, + "step": 11825 + }, + { + "epoch": 1.2987041511091588, + "grad_norm": 2.0315308570861816, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7110841870307922, + "num_tokens": 299136919.0, + "step": 11826 + }, + { + "epoch": 1.2988139688117726, + "grad_norm": 2.276097059249878, + "learning_rate": 1e-06, + "loss": 0.8529, + "mean_token_accuracy": 0.7306519746780396, + "num_tokens": 299162881.0, + "step": 11827 + }, + { + "epoch": 1.2989237865143861, + "grad_norm": 2.9134721755981445, + "learning_rate": 1e-06, + "loss": 0.8294, + "mean_token_accuracy": 0.7334437966346741, + "num_tokens": 299180124.0, + "step": 11828 + }, + { + "epoch": 1.2990336042169996, + "grad_norm": 2.4029197692871094, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.7239578366279602, + "num_tokens": 299203032.0, + "step": 11829 + }, + { + "epoch": 1.2991434219196134, + "grad_norm": 2.1719276905059814, + 
"learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7010673880577087, + "num_tokens": 299233059.0, + "step": 11830 + }, + { + "epoch": 1.2992532396222272, + "grad_norm": 2.233506679534912, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7315465211868286, + "num_tokens": 299259209.0, + "step": 11831 + }, + { + "epoch": 1.2993630573248407, + "grad_norm": 2.488276243209839, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7085727453231812, + "num_tokens": 299280980.0, + "step": 11832 + }, + { + "epoch": 1.2994728750274545, + "grad_norm": 2.1146395206451416, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7051064968109131, + "num_tokens": 299311303.0, + "step": 11833 + }, + { + "epoch": 1.299582692730068, + "grad_norm": 2.2567896842956543, + "learning_rate": 1e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7290366888046265, + "num_tokens": 299337229.0, + "step": 11834 + }, + { + "epoch": 1.2996925104326817, + "grad_norm": 2.3361780643463135, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7024728059768677, + "num_tokens": 299364240.0, + "step": 11835 + }, + { + "epoch": 1.2998023281352955, + "grad_norm": 2.829169511795044, + "learning_rate": 1e-06, + "loss": 0.813, + "mean_token_accuracy": 0.7427585124969482, + "num_tokens": 299380789.0, + "step": 11836 + }, + { + "epoch": 1.299912145837909, + "grad_norm": 2.549879312515259, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7132766246795654, + "num_tokens": 299401957.0, + "step": 11837 + }, + { + "epoch": 1.3000219635405228, + "grad_norm": 2.6464693546295166, + "learning_rate": 1e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.7277767658233643, + "num_tokens": 299422845.0, + "step": 11838 + }, + { + "epoch": 1.3001317812431363, + "grad_norm": 2.117274522781372, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7071495652198792, + "num_tokens": 299450911.0, + "step": 
11839 + }, + { + "epoch": 1.30024159894575, + "grad_norm": 2.134972095489502, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7063234448432922, + "num_tokens": 299478060.0, + "step": 11840 + }, + { + "epoch": 1.3003514166483638, + "grad_norm": 2.2605011463165283, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.711570680141449, + "num_tokens": 299503287.0, + "step": 11841 + }, + { + "epoch": 1.3004612343509774, + "grad_norm": 2.220296621322632, + "learning_rate": 1e-06, + "loss": 0.839, + "mean_token_accuracy": 0.7370650768280029, + "num_tokens": 299529566.0, + "step": 11842 + }, + { + "epoch": 1.300571052053591, + "grad_norm": 2.2784454822540283, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7110751867294312, + "num_tokens": 299554879.0, + "step": 11843 + }, + { + "epoch": 1.3006808697562047, + "grad_norm": 2.351409912109375, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7379196882247925, + "num_tokens": 299578128.0, + "step": 11844 + }, + { + "epoch": 1.3007906874588184, + "grad_norm": 2.3572838306427, + "learning_rate": 1e-06, + "loss": 0.8294, + "mean_token_accuracy": 0.7373596429824829, + "num_tokens": 299602040.0, + "step": 11845 + }, + { + "epoch": 1.300900505161432, + "grad_norm": 2.672576904296875, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7250034809112549, + "num_tokens": 299622269.0, + "step": 11846 + }, + { + "epoch": 1.3010103228640457, + "grad_norm": 2.034878730773926, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7092247009277344, + "num_tokens": 299653175.0, + "step": 11847 + }, + { + "epoch": 1.3011201405666593, + "grad_norm": 2.2828309535980225, + "learning_rate": 1e-06, + "loss": 0.8473, + "mean_token_accuracy": 0.7329332828521729, + "num_tokens": 299677184.0, + "step": 11848 + }, + { + "epoch": 1.301229958269273, + "grad_norm": 2.4439446926116943, + "learning_rate": 1e-06, + "loss": 0.8981, + 
"mean_token_accuracy": 0.7178307771682739, + "num_tokens": 299700823.0, + "step": 11849 + }, + { + "epoch": 1.3013397759718868, + "grad_norm": 2.2037200927734375, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7295205593109131, + "num_tokens": 299727920.0, + "step": 11850 + }, + { + "epoch": 1.3014495936745003, + "grad_norm": 2.1966001987457275, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7209786772727966, + "num_tokens": 299753314.0, + "step": 11851 + }, + { + "epoch": 1.301559411377114, + "grad_norm": 2.424044370651245, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7080258131027222, + "num_tokens": 299775238.0, + "step": 11852 + }, + { + "epoch": 1.3016692290797276, + "grad_norm": 2.085395097732544, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7336320281028748, + "num_tokens": 299803076.0, + "step": 11853 + }, + { + "epoch": 1.3017790467823414, + "grad_norm": 2.3662266731262207, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7091116905212402, + "num_tokens": 299827348.0, + "step": 11854 + }, + { + "epoch": 1.301888864484955, + "grad_norm": 2.3458852767944336, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7284992933273315, + "num_tokens": 299850800.0, + "step": 11855 + }, + { + "epoch": 1.3019986821875686, + "grad_norm": 2.7600667476654053, + "learning_rate": 1e-06, + "loss": 0.8025, + "mean_token_accuracy": 0.7533518075942993, + "num_tokens": 299867702.0, + "step": 11856 + }, + { + "epoch": 1.3021084998901822, + "grad_norm": 2.2183141708374023, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7154387831687927, + "num_tokens": 299894265.0, + "step": 11857 + }, + { + "epoch": 1.302218317592796, + "grad_norm": 2.7472660541534424, + "learning_rate": 1e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.724461555480957, + "num_tokens": 299912919.0, + "step": 11858 + }, + { + "epoch": 
1.3023281352954097, + "grad_norm": 2.2352912425994873, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.71731036901474, + "num_tokens": 299937890.0, + "step": 11859 + }, + { + "epoch": 1.3024379529980232, + "grad_norm": 2.497687339782715, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7250546216964722, + "num_tokens": 299959107.0, + "step": 11860 + }, + { + "epoch": 1.302547770700637, + "grad_norm": 2.6089985370635986, + "learning_rate": 1e-06, + "loss": 0.7828, + "mean_token_accuracy": 0.7501436471939087, + "num_tokens": 299977120.0, + "step": 11861 + }, + { + "epoch": 1.3026575884032505, + "grad_norm": 2.2778677940368652, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.688066303730011, + "num_tokens": 300004878.0, + "step": 11862 + }, + { + "epoch": 1.3027674061058643, + "grad_norm": 2.338122606277466, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7202802896499634, + "num_tokens": 300029293.0, + "step": 11863 + }, + { + "epoch": 1.302877223808478, + "grad_norm": 2.248284101486206, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7158955335617065, + "num_tokens": 300055439.0, + "step": 11864 + }, + { + "epoch": 1.3029870415110916, + "grad_norm": 2.3379268646240234, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7173316478729248, + "num_tokens": 300079081.0, + "step": 11865 + }, + { + "epoch": 1.3030968592137053, + "grad_norm": 2.0198051929473877, + "learning_rate": 1e-06, + "loss": 0.8226, + "mean_token_accuracy": 0.7455917000770569, + "num_tokens": 300106802.0, + "step": 11866 + }, + { + "epoch": 1.3032066769163189, + "grad_norm": 2.1583025455474854, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7228580713272095, + "num_tokens": 300135355.0, + "step": 11867 + }, + { + "epoch": 1.3033164946189326, + "grad_norm": 2.3170182704925537, + "learning_rate": 1e-06, + "loss": 0.8767, + "mean_token_accuracy": 
0.7310909628868103, + "num_tokens": 300157205.0, + "step": 11868 + }, + { + "epoch": 1.3034263123215462, + "grad_norm": 2.430823802947998, + "learning_rate": 1e-06, + "loss": 0.8379, + "mean_token_accuracy": 0.737865686416626, + "num_tokens": 300180296.0, + "step": 11869 + }, + { + "epoch": 1.30353613002416, + "grad_norm": 2.259855270385742, + "learning_rate": 1e-06, + "loss": 0.8502, + "mean_token_accuracy": 0.7252057790756226, + "num_tokens": 300204545.0, + "step": 11870 + }, + { + "epoch": 1.3036459477267734, + "grad_norm": 1.989490270614624, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7209824919700623, + "num_tokens": 300234575.0, + "step": 11871 + }, + { + "epoch": 1.3037557654293872, + "grad_norm": 2.184671640396118, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7234519720077515, + "num_tokens": 300262070.0, + "step": 11872 + }, + { + "epoch": 1.303865583132001, + "grad_norm": 2.1403017044067383, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7257131338119507, + "num_tokens": 300291463.0, + "step": 11873 + }, + { + "epoch": 1.3039754008346145, + "grad_norm": 2.2043397426605225, + "learning_rate": 1e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.7272059321403503, + "num_tokens": 300317290.0, + "step": 11874 + }, + { + "epoch": 1.3040852185372283, + "grad_norm": 2.1759369373321533, + "learning_rate": 1e-06, + "loss": 0.8424, + "mean_token_accuracy": 0.7332423329353333, + "num_tokens": 300343436.0, + "step": 11875 + }, + { + "epoch": 1.3041950362398418, + "grad_norm": 2.130284309387207, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7009282112121582, + "num_tokens": 300371858.0, + "step": 11876 + }, + { + "epoch": 1.3043048539424555, + "grad_norm": 2.3385469913482666, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7193889617919922, + "num_tokens": 300396645.0, + "step": 11877 + }, + { + "epoch": 1.3044146716450693, + "grad_norm": 
3.3224668502807617, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7259060144424438, + "num_tokens": 300419855.0, + "step": 11878 + }, + { + "epoch": 1.3045244893476828, + "grad_norm": 2.5966837406158447, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7134304046630859, + "num_tokens": 300441220.0, + "step": 11879 + }, + { + "epoch": 1.3046343070502964, + "grad_norm": 2.1655638217926025, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7014520168304443, + "num_tokens": 300470627.0, + "step": 11880 + }, + { + "epoch": 1.3047441247529101, + "grad_norm": 2.322235584259033, + "learning_rate": 1e-06, + "loss": 0.8501, + "mean_token_accuracy": 0.7375454306602478, + "num_tokens": 300495186.0, + "step": 11881 + }, + { + "epoch": 1.3048539424555239, + "grad_norm": 2.3623201847076416, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7111036777496338, + "num_tokens": 300517980.0, + "step": 11882 + }, + { + "epoch": 1.3049637601581374, + "grad_norm": 2.477109909057617, + "learning_rate": 1e-06, + "loss": 0.8194, + "mean_token_accuracy": 0.7419602870941162, + "num_tokens": 300538761.0, + "step": 11883 + }, + { + "epoch": 1.3050735778607512, + "grad_norm": 2.4051430225372314, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7219322323799133, + "num_tokens": 300562198.0, + "step": 11884 + }, + { + "epoch": 1.3051833955633647, + "grad_norm": 2.3410401344299316, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7122015357017517, + "num_tokens": 300585532.0, + "step": 11885 + }, + { + "epoch": 1.3052932132659785, + "grad_norm": 2.377248764038086, + "learning_rate": 1e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7343194484710693, + "num_tokens": 300608258.0, + "step": 11886 + }, + { + "epoch": 1.3054030309685922, + "grad_norm": 2.1490671634674072, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7008243799209595, + 
"num_tokens": 300637570.0, + "step": 11887 + }, + { + "epoch": 1.3055128486712058, + "grad_norm": 2.273000955581665, + "learning_rate": 1e-06, + "loss": 0.888, + "mean_token_accuracy": 0.7285105586051941, + "num_tokens": 300663664.0, + "step": 11888 + }, + { + "epoch": 1.3056226663738195, + "grad_norm": 2.0215721130371094, + "learning_rate": 1e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.727735161781311, + "num_tokens": 300692731.0, + "step": 11889 + }, + { + "epoch": 1.305732484076433, + "grad_norm": 1.9655073881149292, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7037556171417236, + "num_tokens": 300725931.0, + "step": 11890 + }, + { + "epoch": 1.3058423017790468, + "grad_norm": 2.793523073196411, + "learning_rate": 1e-06, + "loss": 0.784, + "mean_token_accuracy": 0.748939037322998, + "num_tokens": 300744164.0, + "step": 11891 + }, + { + "epoch": 1.3059521194816606, + "grad_norm": 2.140869617462158, + "learning_rate": 1e-06, + "loss": 0.7944, + "mean_token_accuracy": 0.7465827465057373, + "num_tokens": 300771284.0, + "step": 11892 + }, + { + "epoch": 1.306061937184274, + "grad_norm": 2.1837384700775146, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.718132734298706, + "num_tokens": 300798903.0, + "step": 11893 + }, + { + "epoch": 1.3061717548868876, + "grad_norm": 2.144580841064453, + "learning_rate": 1e-06, + "loss": 0.8179, + "mean_token_accuracy": 0.7413982152938843, + "num_tokens": 300825633.0, + "step": 11894 + }, + { + "epoch": 1.3062815725895014, + "grad_norm": 2.1668472290039062, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7278639078140259, + "num_tokens": 300852425.0, + "step": 11895 + }, + { + "epoch": 1.3063913902921152, + "grad_norm": 2.461031436920166, + "learning_rate": 1e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.7238636612892151, + "num_tokens": 300872745.0, + "step": 11896 + }, + { + "epoch": 1.3065012079947287, + "grad_norm": 2.457742691040039, + 
"learning_rate": 1e-06, + "loss": 0.8412, + "mean_token_accuracy": 0.7380057573318481, + "num_tokens": 300894630.0, + "step": 11897 + }, + { + "epoch": 1.3066110256973424, + "grad_norm": 2.376134157180786, + "learning_rate": 1e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7345243096351624, + "num_tokens": 300920984.0, + "step": 11898 + }, + { + "epoch": 1.306720843399956, + "grad_norm": 2.456871509552002, + "learning_rate": 1e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.6987302303314209, + "num_tokens": 300944983.0, + "step": 11899 + }, + { + "epoch": 1.3068306611025697, + "grad_norm": 2.1277544498443604, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.6945468187332153, + "num_tokens": 300973232.0, + "step": 11900 + }, + { + "epoch": 1.3069404788051835, + "grad_norm": 2.2197351455688477, + "learning_rate": 1e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.7252349257469177, + "num_tokens": 300997070.0, + "step": 11901 + }, + { + "epoch": 1.307050296507797, + "grad_norm": 2.1140949726104736, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7064335346221924, + "num_tokens": 301025407.0, + "step": 11902 + }, + { + "epoch": 1.3071601142104108, + "grad_norm": 2.309217929840088, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7253885269165039, + "num_tokens": 301050306.0, + "step": 11903 + }, + { + "epoch": 1.3072699319130243, + "grad_norm": 2.3494389057159424, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7247025370597839, + "num_tokens": 301073964.0, + "step": 11904 + }, + { + "epoch": 1.307379749615638, + "grad_norm": 2.209812879562378, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7027546167373657, + "num_tokens": 301103849.0, + "step": 11905 + }, + { + "epoch": 1.3074895673182518, + "grad_norm": 2.45473051071167, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7274073958396912, + "num_tokens": 301124164.0, + "step": 
11906 + }, + { + "epoch": 1.3075993850208654, + "grad_norm": 2.070528507232666, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.698911726474762, + "num_tokens": 301154606.0, + "step": 11907 + }, + { + "epoch": 1.307709202723479, + "grad_norm": 1.9905574321746826, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7069519758224487, + "num_tokens": 301184500.0, + "step": 11908 + }, + { + "epoch": 1.3078190204260927, + "grad_norm": 2.1485514640808105, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7141159176826477, + "num_tokens": 301213026.0, + "step": 11909 + }, + { + "epoch": 1.3079288381287064, + "grad_norm": 2.3459742069244385, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.709476113319397, + "num_tokens": 301236830.0, + "step": 11910 + }, + { + "epoch": 1.30803865583132, + "grad_norm": 2.4643170833587646, + "learning_rate": 1e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7278028726577759, + "num_tokens": 301259056.0, + "step": 11911 + }, + { + "epoch": 1.3081484735339337, + "grad_norm": 2.649259567260742, + "learning_rate": 1e-06, + "loss": 0.8224, + "mean_token_accuracy": 0.7386420965194702, + "num_tokens": 301278979.0, + "step": 11912 + }, + { + "epoch": 1.3082582912365472, + "grad_norm": 2.1728250980377197, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7076855301856995, + "num_tokens": 301307614.0, + "step": 11913 + }, + { + "epoch": 1.308368108939161, + "grad_norm": 2.5250141620635986, + "learning_rate": 1e-06, + "loss": 0.7678, + "mean_token_accuracy": 0.7504012584686279, + "num_tokens": 301326654.0, + "step": 11914 + }, + { + "epoch": 1.3084779266417748, + "grad_norm": 1.9850451946258545, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7102409601211548, + "num_tokens": 301357855.0, + "step": 11915 + }, + { + "epoch": 1.3085877443443883, + "grad_norm": 2.309360980987549, + "learning_rate": 1e-06, + "loss": 0.952, + 
"mean_token_accuracy": 0.705154538154602, + "num_tokens": 301382431.0, + "step": 11916 + }, + { + "epoch": 1.308697562047002, + "grad_norm": 2.3653037548065186, + "learning_rate": 1e-06, + "loss": 0.8202, + "mean_token_accuracy": 0.7468172907829285, + "num_tokens": 301405803.0, + "step": 11917 + }, + { + "epoch": 1.3088073797496156, + "grad_norm": 1.9575368165969849, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.6977817416191101, + "num_tokens": 301438225.0, + "step": 11918 + }, + { + "epoch": 1.3089171974522293, + "grad_norm": 2.2745144367218018, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7227932810783386, + "num_tokens": 301463563.0, + "step": 11919 + }, + { + "epoch": 1.309027015154843, + "grad_norm": 2.295161247253418, + "learning_rate": 1e-06, + "loss": 0.8375, + "mean_token_accuracy": 0.7379323244094849, + "num_tokens": 301488844.0, + "step": 11920 + }, + { + "epoch": 1.3091368328574566, + "grad_norm": 2.115445613861084, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.707256555557251, + "num_tokens": 301516698.0, + "step": 11921 + }, + { + "epoch": 1.3092466505600702, + "grad_norm": 2.276698589324951, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7229272127151489, + "num_tokens": 301542710.0, + "step": 11922 + }, + { + "epoch": 1.309356468262684, + "grad_norm": 2.5373079776763916, + "learning_rate": 1e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7400217652320862, + "num_tokens": 301563186.0, + "step": 11923 + }, + { + "epoch": 1.3094662859652977, + "grad_norm": 2.233126640319824, + "learning_rate": 1e-06, + "loss": 0.7943, + "mean_token_accuracy": 0.7473828792572021, + "num_tokens": 301588929.0, + "step": 11924 + }, + { + "epoch": 1.3095761036679112, + "grad_norm": 2.417297601699829, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7299239039421082, + "num_tokens": 301612630.0, + "step": 11925 + }, + { + "epoch": 1.309685921370525, + 
"grad_norm": 2.217716693878174, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7112990021705627, + "num_tokens": 301637544.0, + "step": 11926 + }, + { + "epoch": 1.3097957390731385, + "grad_norm": 2.470970392227173, + "learning_rate": 1e-06, + "loss": 0.8256, + "mean_token_accuracy": 0.7360594272613525, + "num_tokens": 301660054.0, + "step": 11927 + }, + { + "epoch": 1.3099055567757523, + "grad_norm": 2.1637141704559326, + "learning_rate": 1e-06, + "loss": 0.876, + "mean_token_accuracy": 0.7283033728599548, + "num_tokens": 301687301.0, + "step": 11928 + }, + { + "epoch": 1.310015374478366, + "grad_norm": 1.9646438360214233, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6865807771682739, + "num_tokens": 301720217.0, + "step": 11929 + }, + { + "epoch": 1.3101251921809796, + "grad_norm": 2.1480677127838135, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.6990967392921448, + "num_tokens": 301746369.0, + "step": 11930 + }, + { + "epoch": 1.3102350098835933, + "grad_norm": 2.2315073013305664, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7155849933624268, + "num_tokens": 301777230.0, + "step": 11931 + }, + { + "epoch": 1.3103448275862069, + "grad_norm": 2.050625801086426, + "learning_rate": 1e-06, + "loss": 0.8786, + "mean_token_accuracy": 0.7239296436309814, + "num_tokens": 301807723.0, + "step": 11932 + }, + { + "epoch": 1.3104546452888206, + "grad_norm": 2.193089485168457, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7068449854850769, + "num_tokens": 301835041.0, + "step": 11933 + }, + { + "epoch": 1.3105644629914341, + "grad_norm": 2.0865042209625244, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.707874059677124, + "num_tokens": 301862924.0, + "step": 11934 + }, + { + "epoch": 1.310674280694048, + "grad_norm": 2.18847918510437, + "learning_rate": 1e-06, + "loss": 1.0451, + "mean_token_accuracy": 0.6873019933700562, + 
"num_tokens": 301893007.0, + "step": 11935 + }, + { + "epoch": 1.3107840983966614, + "grad_norm": 2.3545968532562256, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7146991491317749, + "num_tokens": 301918102.0, + "step": 11936 + }, + { + "epoch": 1.3108939160992752, + "grad_norm": 2.327378511428833, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 0.699492871761322, + "num_tokens": 301942226.0, + "step": 11937 + }, + { + "epoch": 1.311003733801889, + "grad_norm": 2.2235476970672607, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7218274474143982, + "num_tokens": 301967998.0, + "step": 11938 + }, + { + "epoch": 1.3111135515045025, + "grad_norm": 2.427234172821045, + "learning_rate": 1e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.725778341293335, + "num_tokens": 301990192.0, + "step": 11939 + }, + { + "epoch": 1.3112233692071162, + "grad_norm": 2.219219207763672, + "learning_rate": 1e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7372448444366455, + "num_tokens": 302016279.0, + "step": 11940 + }, + { + "epoch": 1.3113331869097298, + "grad_norm": 2.0453598499298096, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7009786367416382, + "num_tokens": 302045748.0, + "step": 11941 + }, + { + "epoch": 1.3114430046123435, + "grad_norm": 2.276442289352417, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7085553407669067, + "num_tokens": 302071510.0, + "step": 11942 + }, + { + "epoch": 1.3115528223149573, + "grad_norm": 2.4558329582214355, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7167177200317383, + "num_tokens": 302094899.0, + "step": 11943 + }, + { + "epoch": 1.3116626400175708, + "grad_norm": 2.2158126831054688, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.6981227397918701, + "num_tokens": 302120959.0, + "step": 11944 + }, + { + "epoch": 1.3117724577201844, + "grad_norm": 2.3540265560150146, + 
"learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.703956663608551, + "num_tokens": 302145758.0, + "step": 11945 + }, + { + "epoch": 1.3118822754227981, + "grad_norm": 2.249565601348877, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.6995285749435425, + "num_tokens": 302171320.0, + "step": 11946 + }, + { + "epoch": 1.3119920931254119, + "grad_norm": 2.0931200981140137, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7130941152572632, + "num_tokens": 302197432.0, + "step": 11947 + }, + { + "epoch": 1.3121019108280254, + "grad_norm": 2.1085398197174072, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7054381370544434, + "num_tokens": 302225137.0, + "step": 11948 + }, + { + "epoch": 1.3122117285306392, + "grad_norm": 2.1978297233581543, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.6955634951591492, + "num_tokens": 302250701.0, + "step": 11949 + }, + { + "epoch": 1.3123215462332527, + "grad_norm": 2.2122535705566406, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7033491730690002, + "num_tokens": 302278199.0, + "step": 11950 + }, + { + "epoch": 1.3124313639358665, + "grad_norm": 2.2485880851745605, + "learning_rate": 1e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.7272257208824158, + "num_tokens": 302302262.0, + "step": 11951 + }, + { + "epoch": 1.3125411816384802, + "grad_norm": 2.4136929512023926, + "learning_rate": 1e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7220313549041748, + "num_tokens": 302323862.0, + "step": 11952 + }, + { + "epoch": 1.3126509993410937, + "grad_norm": 2.160773277282715, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7100225687026978, + "num_tokens": 302349267.0, + "step": 11953 + }, + { + "epoch": 1.3127608170437075, + "grad_norm": 2.2610902786254883, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.6976232528686523, + "num_tokens": 302377205.0, + 
"step": 11954 + }, + { + "epoch": 1.312870634746321, + "grad_norm": 2.3608624935150146, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7108582258224487, + "num_tokens": 302400749.0, + "step": 11955 + }, + { + "epoch": 1.3129804524489348, + "grad_norm": 2.671170711517334, + "learning_rate": 1e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.7283844947814941, + "num_tokens": 302419721.0, + "step": 11956 + }, + { + "epoch": 1.3130902701515486, + "grad_norm": 2.367466926574707, + "learning_rate": 1e-06, + "loss": 0.852, + "mean_token_accuracy": 0.736204981803894, + "num_tokens": 302442395.0, + "step": 11957 + }, + { + "epoch": 1.313200087854162, + "grad_norm": 2.386215925216675, + "learning_rate": 1e-06, + "loss": 0.8396, + "mean_token_accuracy": 0.7312947511672974, + "num_tokens": 302462780.0, + "step": 11958 + }, + { + "epoch": 1.3133099055567756, + "grad_norm": 2.2317423820495605, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.6955686807632446, + "num_tokens": 302489810.0, + "step": 11959 + }, + { + "epoch": 1.3134197232593894, + "grad_norm": 2.5127415657043457, + "learning_rate": 1e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.729682445526123, + "num_tokens": 302510376.0, + "step": 11960 + }, + { + "epoch": 1.3135295409620031, + "grad_norm": 2.0955166816711426, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.705882728099823, + "num_tokens": 302540330.0, + "step": 11961 + }, + { + "epoch": 1.3136393586646167, + "grad_norm": 2.134944200515747, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7105663418769836, + "num_tokens": 302571971.0, + "step": 11962 + }, + { + "epoch": 1.3137491763672304, + "grad_norm": 2.358011484146118, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7232095003128052, + "num_tokens": 302596465.0, + "step": 11963 + }, + { + "epoch": 1.313858994069844, + "grad_norm": 2.3311872482299805, + "learning_rate": 1e-06, + "loss": 
0.8225, + "mean_token_accuracy": 0.7491253614425659, + "num_tokens": 302619517.0, + "step": 11964 + }, + { + "epoch": 1.3139688117724577, + "grad_norm": 2.342118978500366, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.6991376876831055, + "num_tokens": 302645348.0, + "step": 11965 + }, + { + "epoch": 1.3140786294750715, + "grad_norm": 2.07671856880188, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.703436017036438, + "num_tokens": 302673254.0, + "step": 11966 + }, + { + "epoch": 1.314188447177685, + "grad_norm": 2.2222461700439453, + "learning_rate": 1e-06, + "loss": 0.854, + "mean_token_accuracy": 0.733264684677124, + "num_tokens": 302698864.0, + "step": 11967 + }, + { + "epoch": 1.3142982648802988, + "grad_norm": 2.071390390396118, + "learning_rate": 1e-06, + "loss": 1.0445, + "mean_token_accuracy": 0.6874187588691711, + "num_tokens": 302728696.0, + "step": 11968 + }, + { + "epoch": 1.3144080825829123, + "grad_norm": 2.4594531059265137, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7320645451545715, + "num_tokens": 302749943.0, + "step": 11969 + }, + { + "epoch": 1.314517900285526, + "grad_norm": 2.103883743286133, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7136845588684082, + "num_tokens": 302778983.0, + "step": 11970 + }, + { + "epoch": 1.3146277179881398, + "grad_norm": 2.4318792819976807, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7070877552032471, + "num_tokens": 302801681.0, + "step": 11971 + }, + { + "epoch": 1.3147375356907534, + "grad_norm": 2.5049402713775635, + "learning_rate": 1e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7269996404647827, + "num_tokens": 302823234.0, + "step": 11972 + }, + { + "epoch": 1.314847353393367, + "grad_norm": 2.1779630184173584, + "learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.6952152252197266, + "num_tokens": 302851499.0, + "step": 11973 + }, + { + "epoch": 
1.3149571710959806, + "grad_norm": 2.5369713306427, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.714423418045044, + "num_tokens": 302872622.0, + "step": 11974 + }, + { + "epoch": 1.3150669887985944, + "grad_norm": 2.243370294570923, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7136812210083008, + "num_tokens": 302896935.0, + "step": 11975 + }, + { + "epoch": 1.315176806501208, + "grad_norm": 2.1625001430511475, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7079804539680481, + "num_tokens": 302924061.0, + "step": 11976 + }, + { + "epoch": 1.3152866242038217, + "grad_norm": 2.2372353076934814, + "learning_rate": 1e-06, + "loss": 0.8677, + "mean_token_accuracy": 0.7317937612533569, + "num_tokens": 302947882.0, + "step": 11977 + }, + { + "epoch": 1.3153964419064352, + "grad_norm": 2.013829469680786, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7025144696235657, + "num_tokens": 302979202.0, + "step": 11978 + }, + { + "epoch": 1.315506259609049, + "grad_norm": 2.528024911880493, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7138821482658386, + "num_tokens": 303000591.0, + "step": 11979 + }, + { + "epoch": 1.3156160773116627, + "grad_norm": 2.1808857917785645, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7106903791427612, + "num_tokens": 303025101.0, + "step": 11980 + }, + { + "epoch": 1.3157258950142763, + "grad_norm": 1.8774895668029785, + "learning_rate": 1e-06, + "loss": 0.8703, + "mean_token_accuracy": 0.728551983833313, + "num_tokens": 303058903.0, + "step": 11981 + }, + { + "epoch": 1.31583571271689, + "grad_norm": 2.5284228324890137, + "learning_rate": 1e-06, + "loss": 0.7955, + "mean_token_accuracy": 0.7436891198158264, + "num_tokens": 303078482.0, + "step": 11982 + }, + { + "epoch": 1.3159455304195036, + "grad_norm": 2.228853940963745, + "learning_rate": 1e-06, + "loss": 0.9004, + "mean_token_accuracy": 
0.7262978553771973, + "num_tokens": 303104885.0, + "step": 11983 + }, + { + "epoch": 1.3160553481221173, + "grad_norm": 2.2878410816192627, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7295146584510803, + "num_tokens": 303130893.0, + "step": 11984 + }, + { + "epoch": 1.3161651658247309, + "grad_norm": 2.0921707153320312, + "learning_rate": 1e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.7254551649093628, + "num_tokens": 303158686.0, + "step": 11985 + }, + { + "epoch": 1.3162749835273446, + "grad_norm": 2.2264950275421143, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7107564210891724, + "num_tokens": 303187202.0, + "step": 11986 + }, + { + "epoch": 1.3163848012299582, + "grad_norm": 2.466719150543213, + "learning_rate": 1e-06, + "loss": 0.8374, + "mean_token_accuracy": 0.7328059077262878, + "num_tokens": 303208011.0, + "step": 11987 + }, + { + "epoch": 1.316494618932572, + "grad_norm": 2.3774917125701904, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7067738175392151, + "num_tokens": 303230376.0, + "step": 11988 + }, + { + "epoch": 1.3166044366351857, + "grad_norm": 2.1560511589050293, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7418334484100342, + "num_tokens": 303258843.0, + "step": 11989 + }, + { + "epoch": 1.3167142543377992, + "grad_norm": 1.9931310415267944, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.699065089225769, + "num_tokens": 303289744.0, + "step": 11990 + }, + { + "epoch": 1.316824072040413, + "grad_norm": 2.5760624408721924, + "learning_rate": 1e-06, + "loss": 0.8055, + "mean_token_accuracy": 0.7387685775756836, + "num_tokens": 303310686.0, + "step": 11991 + }, + { + "epoch": 1.3169338897430265, + "grad_norm": 2.2679877281188965, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7031188011169434, + "num_tokens": 303337047.0, + "step": 11992 + }, + { + "epoch": 1.3170437074456403, + "grad_norm": 
2.261676073074341, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7091573476791382, + "num_tokens": 303362762.0, + "step": 11993 + }, + { + "epoch": 1.317153525148254, + "grad_norm": 2.374490976333618, + "learning_rate": 1e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7331710457801819, + "num_tokens": 303386330.0, + "step": 11994 + }, + { + "epoch": 1.3172633428508675, + "grad_norm": 2.0900936126708984, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7249792814254761, + "num_tokens": 303414368.0, + "step": 11995 + }, + { + "epoch": 1.3173731605534813, + "grad_norm": 2.4229652881622314, + "learning_rate": 1e-06, + "loss": 0.8697, + "mean_token_accuracy": 0.7238231897354126, + "num_tokens": 303436783.0, + "step": 11996 + }, + { + "epoch": 1.3174829782560948, + "grad_norm": 2.1680474281311035, + "learning_rate": 1e-06, + "loss": 0.7955, + "mean_token_accuracy": 0.7513629198074341, + "num_tokens": 303462864.0, + "step": 11997 + }, + { + "epoch": 1.3175927959587086, + "grad_norm": 2.1652278900146484, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.6906936168670654, + "num_tokens": 303491511.0, + "step": 11998 + }, + { + "epoch": 1.3177026136613221, + "grad_norm": 2.1210103034973145, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7077222466468811, + "num_tokens": 303519937.0, + "step": 11999 + }, + { + "epoch": 1.3178124313639359, + "grad_norm": 2.2593319416046143, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7078837156295776, + "num_tokens": 303546398.0, + "step": 12000 + }, + { + "epoch": 1.3179222490665494, + "grad_norm": 2.6044766902923584, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7282319664955139, + "num_tokens": 303565539.0, + "step": 12001 + }, + { + "epoch": 1.3180320667691632, + "grad_norm": 2.2976622581481934, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7122141122817993, + 
"num_tokens": 303591423.0, + "step": 12002 + }, + { + "epoch": 1.318141884471777, + "grad_norm": 2.187934160232544, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7211596369743347, + "num_tokens": 303615769.0, + "step": 12003 + }, + { + "epoch": 1.3182517021743905, + "grad_norm": 2.258136510848999, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7014234066009521, + "num_tokens": 303641193.0, + "step": 12004 + }, + { + "epoch": 1.3183615198770042, + "grad_norm": 2.4114396572113037, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7243710160255432, + "num_tokens": 303664792.0, + "step": 12005 + }, + { + "epoch": 1.3184713375796178, + "grad_norm": 2.3850085735321045, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7077803611755371, + "num_tokens": 303689783.0, + "step": 12006 + }, + { + "epoch": 1.3185811552822315, + "grad_norm": 2.1921536922454834, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.710395097732544, + "num_tokens": 303718019.0, + "step": 12007 + }, + { + "epoch": 1.3186909729848453, + "grad_norm": 2.345487356185913, + "learning_rate": 1e-06, + "loss": 0.8416, + "mean_token_accuracy": 0.7335137128829956, + "num_tokens": 303742168.0, + "step": 12008 + }, + { + "epoch": 1.3188007906874588, + "grad_norm": 2.3524515628814697, + "learning_rate": 1e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.7287343740463257, + "num_tokens": 303766196.0, + "step": 12009 + }, + { + "epoch": 1.3189106083900723, + "grad_norm": 2.3980610370635986, + "learning_rate": 1e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7262171506881714, + "num_tokens": 303790323.0, + "step": 12010 + }, + { + "epoch": 1.319020426092686, + "grad_norm": 2.533780097961426, + "learning_rate": 1e-06, + "loss": 0.8458, + "mean_token_accuracy": 0.7398784756660461, + "num_tokens": 303810307.0, + "step": 12011 + }, + { + "epoch": 1.3191302437952999, + "grad_norm": 2.4904730319976807, + 
"learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7225948572158813, + "num_tokens": 303833508.0, + "step": 12012 + }, + { + "epoch": 1.3192400614979134, + "grad_norm": 2.1021313667297363, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7166621088981628, + "num_tokens": 303860104.0, + "step": 12013 + }, + { + "epoch": 1.3193498792005272, + "grad_norm": 2.3636951446533203, + "learning_rate": 1e-06, + "loss": 0.8347, + "mean_token_accuracy": 0.7322372198104858, + "num_tokens": 303883042.0, + "step": 12014 + }, + { + "epoch": 1.3194596969031407, + "grad_norm": 2.46695876121521, + "learning_rate": 1e-06, + "loss": 0.8379, + "mean_token_accuracy": 0.735466480255127, + "num_tokens": 303904751.0, + "step": 12015 + }, + { + "epoch": 1.3195695146057544, + "grad_norm": 2.5009219646453857, + "learning_rate": 1e-06, + "loss": 0.819, + "mean_token_accuracy": 0.7366604804992676, + "num_tokens": 303926579.0, + "step": 12016 + }, + { + "epoch": 1.3196793323083682, + "grad_norm": 2.4286296367645264, + "learning_rate": 1e-06, + "loss": 0.8484, + "mean_token_accuracy": 0.7339773178100586, + "num_tokens": 303949281.0, + "step": 12017 + }, + { + "epoch": 1.3197891500109817, + "grad_norm": 2.384747266769409, + "learning_rate": 1e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7251086235046387, + "num_tokens": 303972339.0, + "step": 12018 + }, + { + "epoch": 1.3198989677135955, + "grad_norm": 2.649278163909912, + "learning_rate": 1e-06, + "loss": 0.8438, + "mean_token_accuracy": 0.7361907958984375, + "num_tokens": 303992439.0, + "step": 12019 + }, + { + "epoch": 1.320008785416209, + "grad_norm": 2.0794248580932617, + "learning_rate": 1e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7385028004646301, + "num_tokens": 304020422.0, + "step": 12020 + }, + { + "epoch": 1.3201186031188228, + "grad_norm": 2.245241641998291, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7242424488067627, + "num_tokens": 304045931.0, + "step": 
12021 + }, + { + "epoch": 1.3202284208214365, + "grad_norm": 2.18768310546875, + "learning_rate": 1e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7339817881584167, + "num_tokens": 304072475.0, + "step": 12022 + }, + { + "epoch": 1.32033823852405, + "grad_norm": 2.3976805210113525, + "learning_rate": 1e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7347767949104309, + "num_tokens": 304094525.0, + "step": 12023 + }, + { + "epoch": 1.3204480562266636, + "grad_norm": 2.2616424560546875, + "learning_rate": 1e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7054381370544434, + "num_tokens": 304121260.0, + "step": 12024 + }, + { + "epoch": 1.3205578739292774, + "grad_norm": 2.207101583480835, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.7032904624938965, + "num_tokens": 304148937.0, + "step": 12025 + }, + { + "epoch": 1.3206676916318911, + "grad_norm": 2.177865505218506, + "learning_rate": 1e-06, + "loss": 0.857, + "mean_token_accuracy": 0.7268950343132019, + "num_tokens": 304175552.0, + "step": 12026 + }, + { + "epoch": 1.3207775093345047, + "grad_norm": 2.134709596633911, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7046719789505005, + "num_tokens": 304204196.0, + "step": 12027 + }, + { + "epoch": 1.3208873270371184, + "grad_norm": 2.1809887886047363, + "learning_rate": 1e-06, + "loss": 0.8434, + "mean_token_accuracy": 0.7319361567497253, + "num_tokens": 304230642.0, + "step": 12028 + }, + { + "epoch": 1.320997144739732, + "grad_norm": 2.4704997539520264, + "learning_rate": 1e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.7388968467712402, + "num_tokens": 304251827.0, + "step": 12029 + }, + { + "epoch": 1.3211069624423457, + "grad_norm": 2.4725301265716553, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7081933617591858, + "num_tokens": 304274204.0, + "step": 12030 + }, + { + "epoch": 1.3212167801449595, + "grad_norm": 2.189863443374634, + "learning_rate": 1e-06, + "loss": 0.8781, + 
"mean_token_accuracy": 0.7208219766616821, + "num_tokens": 304300387.0, + "step": 12031 + }, + { + "epoch": 1.321326597847573, + "grad_norm": 2.5643742084503174, + "learning_rate": 1e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7237077355384827, + "num_tokens": 304320998.0, + "step": 12032 + }, + { + "epoch": 1.3214364155501868, + "grad_norm": 2.2330074310302734, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.736557126045227, + "num_tokens": 304345780.0, + "step": 12033 + }, + { + "epoch": 1.3215462332528003, + "grad_norm": 2.1391353607177734, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7123436331748962, + "num_tokens": 304373140.0, + "step": 12034 + }, + { + "epoch": 1.321656050955414, + "grad_norm": 2.3733420372009277, + "learning_rate": 1e-06, + "loss": 0.8436, + "mean_token_accuracy": 0.7354055643081665, + "num_tokens": 304397447.0, + "step": 12035 + }, + { + "epoch": 1.3217658686580278, + "grad_norm": 2.203294277191162, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7249431610107422, + "num_tokens": 304422891.0, + "step": 12036 + }, + { + "epoch": 1.3218756863606413, + "grad_norm": 2.2910547256469727, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7158150672912598, + "num_tokens": 304446960.0, + "step": 12037 + }, + { + "epoch": 1.3219855040632549, + "grad_norm": 2.273261308670044, + "learning_rate": 1e-06, + "loss": 0.8288, + "mean_token_accuracy": 0.737666130065918, + "num_tokens": 304472173.0, + "step": 12038 + }, + { + "epoch": 1.3220953217658686, + "grad_norm": 2.2309558391571045, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7132558822631836, + "num_tokens": 304499817.0, + "step": 12039 + }, + { + "epoch": 1.3222051394684824, + "grad_norm": 2.2675883769989014, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7092975378036499, + "num_tokens": 304523874.0, + "step": 12040 + }, + { + "epoch": 
1.322314957171096, + "grad_norm": 2.091817617416382, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7078094482421875, + "num_tokens": 304553700.0, + "step": 12041 + }, + { + "epoch": 1.3224247748737097, + "grad_norm": 2.0545315742492676, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.703303337097168, + "num_tokens": 304583403.0, + "step": 12042 + }, + { + "epoch": 1.3225345925763232, + "grad_norm": 2.2944753170013428, + "learning_rate": 1e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.7212032675743103, + "num_tokens": 304608207.0, + "step": 12043 + }, + { + "epoch": 1.322644410278937, + "grad_norm": 2.421941041946411, + "learning_rate": 1e-06, + "loss": 0.8502, + "mean_token_accuracy": 0.7334922552108765, + "num_tokens": 304630820.0, + "step": 12044 + }, + { + "epoch": 1.3227542279815507, + "grad_norm": 2.490394353866577, + "learning_rate": 1e-06, + "loss": 0.8368, + "mean_token_accuracy": 0.7347344756126404, + "num_tokens": 304651628.0, + "step": 12045 + }, + { + "epoch": 1.3228640456841643, + "grad_norm": 2.176604986190796, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7145558595657349, + "num_tokens": 304679634.0, + "step": 12046 + }, + { + "epoch": 1.322973863386778, + "grad_norm": 2.2843799591064453, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.6981881260871887, + "num_tokens": 304707403.0, + "step": 12047 + }, + { + "epoch": 1.3230836810893916, + "grad_norm": 2.2532968521118164, + "learning_rate": 1e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.7337697744369507, + "num_tokens": 304731839.0, + "step": 12048 + }, + { + "epoch": 1.3231934987920053, + "grad_norm": 2.2367186546325684, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7192426919937134, + "num_tokens": 304759049.0, + "step": 12049 + }, + { + "epoch": 1.3233033164946189, + "grad_norm": 2.151505708694458, + "learning_rate": 1e-06, + "loss": 0.8536, + "mean_token_accuracy": 
0.7322483062744141, + "num_tokens": 304785513.0, + "step": 12050 + }, + { + "epoch": 1.3234131341972326, + "grad_norm": 2.238992214202881, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7251271605491638, + "num_tokens": 304812235.0, + "step": 12051 + }, + { + "epoch": 1.3235229518998461, + "grad_norm": 2.278472423553467, + "learning_rate": 1e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7321646213531494, + "num_tokens": 304837101.0, + "step": 12052 + }, + { + "epoch": 1.32363276960246, + "grad_norm": 2.090637683868408, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7224233150482178, + "num_tokens": 304866964.0, + "step": 12053 + }, + { + "epoch": 1.3237425873050737, + "grad_norm": 2.1149275302886963, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7175885438919067, + "num_tokens": 304897773.0, + "step": 12054 + }, + { + "epoch": 1.3238524050076872, + "grad_norm": 1.9973336458206177, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7195019125938416, + "num_tokens": 304926803.0, + "step": 12055 + }, + { + "epoch": 1.323962222710301, + "grad_norm": 2.3515093326568604, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7284843325614929, + "num_tokens": 304948974.0, + "step": 12056 + }, + { + "epoch": 1.3240720404129145, + "grad_norm": 2.208188056945801, + "learning_rate": 1e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7317875623703003, + "num_tokens": 304974057.0, + "step": 12057 + }, + { + "epoch": 1.3241818581155282, + "grad_norm": 2.4845192432403564, + "learning_rate": 1e-06, + "loss": 0.8284, + "mean_token_accuracy": 0.7355465888977051, + "num_tokens": 304995977.0, + "step": 12058 + }, + { + "epoch": 1.324291675818142, + "grad_norm": 2.5257883071899414, + "learning_rate": 1e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7244383692741394, + "num_tokens": 305017475.0, + "step": 12059 + }, + { + "epoch": 1.3244014935207555, + "grad_norm": 
2.1324143409729004, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.725335955619812, + "num_tokens": 305044349.0, + "step": 12060 + }, + { + "epoch": 1.324511311223369, + "grad_norm": 2.336595296859741, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7192367911338806, + "num_tokens": 305069606.0, + "step": 12061 + }, + { + "epoch": 1.3246211289259828, + "grad_norm": 2.5464751720428467, + "learning_rate": 1e-06, + "loss": 0.8318, + "mean_token_accuracy": 0.7301504015922546, + "num_tokens": 305090495.0, + "step": 12062 + }, + { + "epoch": 1.3247309466285966, + "grad_norm": 2.2125773429870605, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.6965268850326538, + "num_tokens": 305120581.0, + "step": 12063 + }, + { + "epoch": 1.3248407643312101, + "grad_norm": 2.4736216068267822, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7065073847770691, + "num_tokens": 305143423.0, + "step": 12064 + }, + { + "epoch": 1.3249505820338239, + "grad_norm": 1.8585609197616577, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7015480995178223, + "num_tokens": 305179854.0, + "step": 12065 + }, + { + "epoch": 1.3250603997364374, + "grad_norm": 2.384016752243042, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7209944128990173, + "num_tokens": 305201881.0, + "step": 12066 + }, + { + "epoch": 1.3251702174390512, + "grad_norm": 2.5347564220428467, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7195210456848145, + "num_tokens": 305222888.0, + "step": 12067 + }, + { + "epoch": 1.325280035141665, + "grad_norm": 2.2592203617095947, + "learning_rate": 1e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7223547101020813, + "num_tokens": 305248599.0, + "step": 12068 + }, + { + "epoch": 1.3253898528442785, + "grad_norm": 2.324252128601074, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.7038688063621521, + "num_tokens": 
305274161.0, + "step": 12069 + }, + { + "epoch": 1.3254996705468922, + "grad_norm": 2.06436824798584, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7117207050323486, + "num_tokens": 305302264.0, + "step": 12070 + }, + { + "epoch": 1.3256094882495058, + "grad_norm": 2.3672304153442383, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7173612117767334, + "num_tokens": 305325024.0, + "step": 12071 + }, + { + "epoch": 1.3257193059521195, + "grad_norm": 2.198524236679077, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7303229570388794, + "num_tokens": 305352011.0, + "step": 12072 + }, + { + "epoch": 1.3258291236547333, + "grad_norm": 2.065471887588501, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7130652666091919, + "num_tokens": 305382320.0, + "step": 12073 + }, + { + "epoch": 1.3259389413573468, + "grad_norm": 2.465449094772339, + "learning_rate": 1e-06, + "loss": 0.7583, + "mean_token_accuracy": 0.756726861000061, + "num_tokens": 305402419.0, + "step": 12074 + }, + { + "epoch": 1.3260487590599603, + "grad_norm": 2.43426775932312, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7108550071716309, + "num_tokens": 305425180.0, + "step": 12075 + }, + { + "epoch": 1.326158576762574, + "grad_norm": 2.0583157539367676, + "learning_rate": 1e-06, + "loss": 0.8528, + "mean_token_accuracy": 0.7400538325309753, + "num_tokens": 305454121.0, + "step": 12076 + }, + { + "epoch": 1.3262683944651878, + "grad_norm": 2.163888692855835, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7067385911941528, + "num_tokens": 305483742.0, + "step": 12077 + }, + { + "epoch": 1.3263782121678014, + "grad_norm": 2.082054376602173, + "learning_rate": 1e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.7272245287895203, + "num_tokens": 305512735.0, + "step": 12078 + }, + { + "epoch": 1.3264880298704151, + "grad_norm": 2.1902666091918945, + "learning_rate": 1e-06, 
+ "loss": 0.9561, + "mean_token_accuracy": 0.7056057453155518, + "num_tokens": 305540674.0, + "step": 12079 + }, + { + "epoch": 1.3265978475730287, + "grad_norm": 2.40919828414917, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7184181809425354, + "num_tokens": 305563606.0, + "step": 12080 + }, + { + "epoch": 1.3267076652756424, + "grad_norm": 2.3247313499450684, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7299470901489258, + "num_tokens": 305588464.0, + "step": 12081 + }, + { + "epoch": 1.3268174829782562, + "grad_norm": 2.22869873046875, + "learning_rate": 1e-06, + "loss": 0.8509, + "mean_token_accuracy": 0.7450094223022461, + "num_tokens": 305612870.0, + "step": 12082 + }, + { + "epoch": 1.3269273006808697, + "grad_norm": 2.403923511505127, + "learning_rate": 1e-06, + "loss": 0.8442, + "mean_token_accuracy": 0.7306437492370605, + "num_tokens": 305635654.0, + "step": 12083 + }, + { + "epoch": 1.3270371183834835, + "grad_norm": 2.4692223072052, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7117161154747009, + "num_tokens": 305658722.0, + "step": 12084 + }, + { + "epoch": 1.327146936086097, + "grad_norm": 2.169297933578491, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7163740396499634, + "num_tokens": 305686301.0, + "step": 12085 + }, + { + "epoch": 1.3272567537887108, + "grad_norm": 2.2449259757995605, + "learning_rate": 1e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7290292978286743, + "num_tokens": 305710481.0, + "step": 12086 + }, + { + "epoch": 1.3273665714913245, + "grad_norm": 2.396481990814209, + "learning_rate": 1e-06, + "loss": 0.8126, + "mean_token_accuracy": 0.7468016147613525, + "num_tokens": 305731018.0, + "step": 12087 + }, + { + "epoch": 1.327476389193938, + "grad_norm": 2.3000524044036865, + "learning_rate": 1e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7209340333938599, + "num_tokens": 305757327.0, + "step": 12088 + }, + { + "epoch": 
1.3275862068965516, + "grad_norm": 2.180140733718872, + "learning_rate": 1e-06, + "loss": 0.863, + "mean_token_accuracy": 0.7317870855331421, + "num_tokens": 305783679.0, + "step": 12089 + }, + { + "epoch": 1.3276960245991654, + "grad_norm": 2.3584537506103516, + "learning_rate": 1e-06, + "loss": 0.8565, + "mean_token_accuracy": 0.7314252257347107, + "num_tokens": 305806455.0, + "step": 12090 + }, + { + "epoch": 1.3278058423017791, + "grad_norm": 2.14347505569458, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7222583889961243, + "num_tokens": 305833211.0, + "step": 12091 + }, + { + "epoch": 1.3279156600043927, + "grad_norm": 2.3794708251953125, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7290167808532715, + "num_tokens": 305856744.0, + "step": 12092 + }, + { + "epoch": 1.3280254777070064, + "grad_norm": 2.0813252925872803, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7246586084365845, + "num_tokens": 305887019.0, + "step": 12093 + }, + { + "epoch": 1.32813529540962, + "grad_norm": 2.1300580501556396, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7212780714035034, + "num_tokens": 305913273.0, + "step": 12094 + }, + { + "epoch": 1.3282451131122337, + "grad_norm": 2.26558256149292, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7075990438461304, + "num_tokens": 305940614.0, + "step": 12095 + }, + { + "epoch": 1.3283549308148475, + "grad_norm": 2.221454381942749, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.710604727268219, + "num_tokens": 305968720.0, + "step": 12096 + }, + { + "epoch": 1.328464748517461, + "grad_norm": 2.115417957305908, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.6910301446914673, + "num_tokens": 305998633.0, + "step": 12097 + }, + { + "epoch": 1.3285745662200747, + "grad_norm": 2.4377832412719727, + "learning_rate": 1e-06, + "loss": 0.869, + "mean_token_accuracy": 
0.7212040424346924, + "num_tokens": 306021376.0, + "step": 12098 + }, + { + "epoch": 1.3286843839226883, + "grad_norm": 2.2854855060577393, + "learning_rate": 1e-06, + "loss": 0.8247, + "mean_token_accuracy": 0.7370386123657227, + "num_tokens": 306045197.0, + "step": 12099 + }, + { + "epoch": 1.328794201625302, + "grad_norm": 2.6053993701934814, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7202174663543701, + "num_tokens": 306064849.0, + "step": 12100 + }, + { + "epoch": 1.3289040193279158, + "grad_norm": 2.463831663131714, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7179443836212158, + "num_tokens": 306086711.0, + "step": 12101 + }, + { + "epoch": 1.3290138370305293, + "grad_norm": 2.216198682785034, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7146774530410767, + "num_tokens": 306111694.0, + "step": 12102 + }, + { + "epoch": 1.3291236547331429, + "grad_norm": 2.0197088718414307, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7117498517036438, + "num_tokens": 306142060.0, + "step": 12103 + }, + { + "epoch": 1.3292334724357566, + "grad_norm": 2.5236103534698486, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7090747356414795, + "num_tokens": 306164601.0, + "step": 12104 + }, + { + "epoch": 1.3293432901383704, + "grad_norm": 2.2135229110717773, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7250922918319702, + "num_tokens": 306191830.0, + "step": 12105 + }, + { + "epoch": 1.329453107840984, + "grad_norm": 2.3109612464904785, + "learning_rate": 1e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7173473238945007, + "num_tokens": 306216149.0, + "step": 12106 + }, + { + "epoch": 1.3295629255435977, + "grad_norm": 2.0777058601379395, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7184118628501892, + "num_tokens": 306245666.0, + "step": 12107 + }, + { + "epoch": 1.3296727432462112, + "grad_norm": 
2.3861732482910156, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7177457213401794, + "num_tokens": 306269578.0, + "step": 12108 + }, + { + "epoch": 1.329782560948825, + "grad_norm": 2.2515082359313965, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7216638326644897, + "num_tokens": 306294711.0, + "step": 12109 + }, + { + "epoch": 1.3298923786514387, + "grad_norm": 2.6161041259765625, + "learning_rate": 1e-06, + "loss": 0.8423, + "mean_token_accuracy": 0.7285999655723572, + "num_tokens": 306313824.0, + "step": 12110 + }, + { + "epoch": 1.3300021963540523, + "grad_norm": 2.5186514854431152, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7225354909896851, + "num_tokens": 306334606.0, + "step": 12111 + }, + { + "epoch": 1.330112014056666, + "grad_norm": 2.3459367752075195, + "learning_rate": 1e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.7251745462417603, + "num_tokens": 306357886.0, + "step": 12112 + }, + { + "epoch": 1.3302218317592795, + "grad_norm": 2.424776554107666, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7160285711288452, + "num_tokens": 306382459.0, + "step": 12113 + }, + { + "epoch": 1.3303316494618933, + "grad_norm": 2.5553691387176514, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.703366756439209, + "num_tokens": 306405666.0, + "step": 12114 + }, + { + "epoch": 1.3304414671645068, + "grad_norm": 2.3293094635009766, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.725951075553894, + "num_tokens": 306429721.0, + "step": 12115 + }, + { + "epoch": 1.3305512848671206, + "grad_norm": 2.3688642978668213, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.7049875855445862, + "num_tokens": 306455475.0, + "step": 12116 + }, + { + "epoch": 1.3306611025697341, + "grad_norm": 2.227165699005127, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7074639201164246, + "num_tokens": 
306481333.0, + "step": 12117 + }, + { + "epoch": 1.330770920272348, + "grad_norm": 2.2075514793395996, + "learning_rate": 1e-06, + "loss": 0.8834, + "mean_token_accuracy": 0.7185741662979126, + "num_tokens": 306508857.0, + "step": 12118 + }, + { + "epoch": 1.3308807379749616, + "grad_norm": 2.5549697875976562, + "learning_rate": 1e-06, + "loss": 0.8066, + "mean_token_accuracy": 0.7403923273086548, + "num_tokens": 306528134.0, + "step": 12119 + }, + { + "epoch": 1.3309905556775752, + "grad_norm": 2.374185800552368, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.6968470811843872, + "num_tokens": 306553696.0, + "step": 12120 + }, + { + "epoch": 1.331100373380189, + "grad_norm": 2.033360242843628, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7079934477806091, + "num_tokens": 306583835.0, + "step": 12121 + }, + { + "epoch": 1.3312101910828025, + "grad_norm": 2.349445343017578, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7194899320602417, + "num_tokens": 306606645.0, + "step": 12122 + }, + { + "epoch": 1.3313200087854162, + "grad_norm": 2.2276735305786133, + "learning_rate": 1e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.7297155857086182, + "num_tokens": 306631354.0, + "step": 12123 + }, + { + "epoch": 1.33142982648803, + "grad_norm": 2.0687780380249023, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7161563634872437, + "num_tokens": 306659471.0, + "step": 12124 + }, + { + "epoch": 1.3315396441906435, + "grad_norm": 2.1480350494384766, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7006096839904785, + "num_tokens": 306688103.0, + "step": 12125 + }, + { + "epoch": 1.331649461893257, + "grad_norm": 2.5239624977111816, + "learning_rate": 1e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7198618650436401, + "num_tokens": 306709617.0, + "step": 12126 + }, + { + "epoch": 1.3317592795958708, + "grad_norm": 1.8425348997116089, + "learning_rate": 
1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7056344747543335, + "num_tokens": 306742706.0, + "step": 12127 + }, + { + "epoch": 1.3318690972984846, + "grad_norm": 1.9502068758010864, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7208274006843567, + "num_tokens": 306774153.0, + "step": 12128 + }, + { + "epoch": 1.331978915001098, + "grad_norm": 2.1339833736419678, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7135052680969238, + "num_tokens": 306801548.0, + "step": 12129 + }, + { + "epoch": 1.3320887327037119, + "grad_norm": 2.234912157058716, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7147976756095886, + "num_tokens": 306828719.0, + "step": 12130 + }, + { + "epoch": 1.3321985504063254, + "grad_norm": 2.2264440059661865, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7111707329750061, + "num_tokens": 306856228.0, + "step": 12131 + }, + { + "epoch": 1.3323083681089392, + "grad_norm": 1.9681395292282104, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.711036741733551, + "num_tokens": 306890004.0, + "step": 12132 + }, + { + "epoch": 1.332418185811553, + "grad_norm": 2.1952781677246094, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7103284597396851, + "num_tokens": 306917803.0, + "step": 12133 + }, + { + "epoch": 1.3325280035141664, + "grad_norm": 2.078125238418579, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.7050453424453735, + "num_tokens": 306948775.0, + "step": 12134 + }, + { + "epoch": 1.3326378212167802, + "grad_norm": 2.3673202991485596, + "learning_rate": 1e-06, + "loss": 1.0367, + "mean_token_accuracy": 0.6931321620941162, + "num_tokens": 306974026.0, + "step": 12135 + }, + { + "epoch": 1.3327476389193937, + "grad_norm": 2.158208131790161, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7078187465667725, + "num_tokens": 307002922.0, + "step": 12136 + }, + { 
+ "epoch": 1.3328574566220075, + "grad_norm": 2.1540677547454834, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7129714488983154, + "num_tokens": 307031556.0, + "step": 12137 + }, + { + "epoch": 1.3329672743246213, + "grad_norm": 2.394721508026123, + "learning_rate": 1e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7363686561584473, + "num_tokens": 307054396.0, + "step": 12138 + }, + { + "epoch": 1.3330770920272348, + "grad_norm": 2.406161308288574, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7228924036026001, + "num_tokens": 307076777.0, + "step": 12139 + }, + { + "epoch": 1.3331869097298483, + "grad_norm": 1.9766631126403809, + "learning_rate": 1e-06, + "loss": 0.8383, + "mean_token_accuracy": 0.7385945320129395, + "num_tokens": 307107503.0, + "step": 12140 + }, + { + "epoch": 1.333296727432462, + "grad_norm": 2.028381824493408, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7098156213760376, + "num_tokens": 307137106.0, + "step": 12141 + }, + { + "epoch": 1.3334065451350758, + "grad_norm": 2.110361099243164, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7257061004638672, + "num_tokens": 307164884.0, + "step": 12142 + }, + { + "epoch": 1.3335163628376894, + "grad_norm": 2.450124979019165, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7022722363471985, + "num_tokens": 307186786.0, + "step": 12143 + }, + { + "epoch": 1.3336261805403031, + "grad_norm": 2.1895499229431152, + "learning_rate": 1e-06, + "loss": 0.811, + "mean_token_accuracy": 0.7472780346870422, + "num_tokens": 307211476.0, + "step": 12144 + }, + { + "epoch": 1.3337359982429167, + "grad_norm": 2.1468923091888428, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7230561971664429, + "num_tokens": 307239002.0, + "step": 12145 + }, + { + "epoch": 1.3338458159455304, + "grad_norm": 2.178450345993042, + "learning_rate": 1e-06, + "loss": 0.991, + 
"mean_token_accuracy": 0.7043191194534302, + "num_tokens": 307266298.0, + "step": 12146 + }, + { + "epoch": 1.3339556336481442, + "grad_norm": 2.3292133808135986, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7310897707939148, + "num_tokens": 307290468.0, + "step": 12147 + }, + { + "epoch": 1.3340654513507577, + "grad_norm": 2.4302148818969727, + "learning_rate": 1e-06, + "loss": 0.8367, + "mean_token_accuracy": 0.7380279302597046, + "num_tokens": 307312947.0, + "step": 12148 + }, + { + "epoch": 1.3341752690533715, + "grad_norm": 2.078702688217163, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.6903153657913208, + "num_tokens": 307341058.0, + "step": 12149 + }, + { + "epoch": 1.334285086755985, + "grad_norm": 2.106680154800415, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.6917933225631714, + "num_tokens": 307369832.0, + "step": 12150 + }, + { + "epoch": 1.3343949044585988, + "grad_norm": 2.2437551021575928, + "learning_rate": 1e-06, + "loss": 0.8699, + "mean_token_accuracy": 0.7265589237213135, + "num_tokens": 307394706.0, + "step": 12151 + }, + { + "epoch": 1.3345047221612125, + "grad_norm": 2.233549118041992, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.7071270942687988, + "num_tokens": 307419451.0, + "step": 12152 + }, + { + "epoch": 1.334614539863826, + "grad_norm": 2.541107177734375, + "learning_rate": 1e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7422624826431274, + "num_tokens": 307439684.0, + "step": 12153 + }, + { + "epoch": 1.3347243575664396, + "grad_norm": 2.3729825019836426, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7179642915725708, + "num_tokens": 307464715.0, + "step": 12154 + }, + { + "epoch": 1.3348341752690533, + "grad_norm": 2.6499040126800537, + "learning_rate": 1e-06, + "loss": 0.8523, + "mean_token_accuracy": 0.738746166229248, + "num_tokens": 307483577.0, + "step": 12155 + }, + { + "epoch": 
1.334943992971667, + "grad_norm": 2.569340944290161, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7127285003662109, + "num_tokens": 307504300.0, + "step": 12156 + }, + { + "epoch": 1.3350538106742806, + "grad_norm": 2.149540662765503, + "learning_rate": 1e-06, + "loss": 0.822, + "mean_token_accuracy": 0.7367599606513977, + "num_tokens": 307529967.0, + "step": 12157 + }, + { + "epoch": 1.3351636283768944, + "grad_norm": 2.34633731842041, + "learning_rate": 1e-06, + "loss": 0.855, + "mean_token_accuracy": 0.7298259735107422, + "num_tokens": 307551860.0, + "step": 12158 + }, + { + "epoch": 1.335273446079508, + "grad_norm": 2.267026901245117, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.6921645402908325, + "num_tokens": 307576904.0, + "step": 12159 + }, + { + "epoch": 1.3353832637821217, + "grad_norm": 2.4927968978881836, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7048850655555725, + "num_tokens": 307598754.0, + "step": 12160 + }, + { + "epoch": 1.3354930814847354, + "grad_norm": 2.225865125656128, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7200843691825867, + "num_tokens": 307624022.0, + "step": 12161 + }, + { + "epoch": 1.335602899187349, + "grad_norm": 2.125809907913208, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7087153792381287, + "num_tokens": 307653795.0, + "step": 12162 + }, + { + "epoch": 1.3357127168899627, + "grad_norm": 2.3206393718719482, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7183336019515991, + "num_tokens": 307680023.0, + "step": 12163 + }, + { + "epoch": 1.3358225345925763, + "grad_norm": 2.278597831726074, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.71275395154953, + "num_tokens": 307706420.0, + "step": 12164 + }, + { + "epoch": 1.33593235229519, + "grad_norm": 2.244276523590088, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 
0.7165417671203613, + "num_tokens": 307732215.0, + "step": 12165 + }, + { + "epoch": 1.3360421699978036, + "grad_norm": 1.9799433946609497, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7184308171272278, + "num_tokens": 307764483.0, + "step": 12166 + }, + { + "epoch": 1.3361519877004173, + "grad_norm": 1.9686237573623657, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.6962645053863525, + "num_tokens": 307796255.0, + "step": 12167 + }, + { + "epoch": 1.3362618054030309, + "grad_norm": 2.0115818977355957, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7125211954116821, + "num_tokens": 307827399.0, + "step": 12168 + }, + { + "epoch": 1.3363716231056446, + "grad_norm": 2.6029136180877686, + "learning_rate": 1e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7278201580047607, + "num_tokens": 307848771.0, + "step": 12169 + }, + { + "epoch": 1.3364814408082584, + "grad_norm": 2.054471015930176, + "learning_rate": 1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.6926019191741943, + "num_tokens": 307880755.0, + "step": 12170 + }, + { + "epoch": 1.336591258510872, + "grad_norm": 1.9906938076019287, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.6970805525779724, + "num_tokens": 307912747.0, + "step": 12171 + }, + { + "epoch": 1.3367010762134857, + "grad_norm": 2.7723968029022217, + "learning_rate": 1e-06, + "loss": 0.872, + "mean_token_accuracy": 0.7369022369384766, + "num_tokens": 307932046.0, + "step": 12172 + }, + { + "epoch": 1.3368108939160992, + "grad_norm": 1.9644652605056763, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.6972795724868774, + "num_tokens": 307964823.0, + "step": 12173 + }, + { + "epoch": 1.336920711618713, + "grad_norm": 2.100278377532959, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7123514413833618, + "num_tokens": 307993992.0, + "step": 12174 + }, + { + "epoch": 1.3370305293213267, + "grad_norm": 
2.6180901527404785, + "learning_rate": 1e-06, + "loss": 0.8278, + "mean_token_accuracy": 0.7331355214118958, + "num_tokens": 308013353.0, + "step": 12175 + }, + { + "epoch": 1.3371403470239402, + "grad_norm": 2.264033794403076, + "learning_rate": 1e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.7237662076950073, + "num_tokens": 308037239.0, + "step": 12176 + }, + { + "epoch": 1.337250164726554, + "grad_norm": 2.0266425609588623, + "learning_rate": 1e-06, + "loss": 0.8069, + "mean_token_accuracy": 0.741258978843689, + "num_tokens": 308063779.0, + "step": 12177 + }, + { + "epoch": 1.3373599824291675, + "grad_norm": 2.3267829418182373, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7280400991439819, + "num_tokens": 308088350.0, + "step": 12178 + }, + { + "epoch": 1.3374698001317813, + "grad_norm": 2.248227834701538, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7121906876564026, + "num_tokens": 308112521.0, + "step": 12179 + }, + { + "epoch": 1.3375796178343948, + "grad_norm": 1.9079071283340454, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7046507596969604, + "num_tokens": 308146360.0, + "step": 12180 + }, + { + "epoch": 1.3376894355370086, + "grad_norm": 2.0653927326202393, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7335221767425537, + "num_tokens": 308173957.0, + "step": 12181 + }, + { + "epoch": 1.3377992532396221, + "grad_norm": 2.309584617614746, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7082793712615967, + "num_tokens": 308198066.0, + "step": 12182 + }, + { + "epoch": 1.3379090709422359, + "grad_norm": 2.4454872608184814, + "learning_rate": 1e-06, + "loss": 0.8732, + "mean_token_accuracy": 0.7320058345794678, + "num_tokens": 308220307.0, + "step": 12183 + }, + { + "epoch": 1.3380188886448496, + "grad_norm": 2.250485897064209, + "learning_rate": 1e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7150338292121887, + "num_tokens": 
308243590.0, + "step": 12184 + }, + { + "epoch": 1.3381287063474632, + "grad_norm": 2.1372363567352295, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7083922028541565, + "num_tokens": 308271544.0, + "step": 12185 + }, + { + "epoch": 1.338238524050077, + "grad_norm": 2.536320686340332, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7192999124526978, + "num_tokens": 308292109.0, + "step": 12186 + }, + { + "epoch": 1.3383483417526905, + "grad_norm": 2.246702194213867, + "learning_rate": 1e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7301274538040161, + "num_tokens": 308318395.0, + "step": 12187 + }, + { + "epoch": 1.3384581594553042, + "grad_norm": 2.1110594272613525, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7233933210372925, + "num_tokens": 308345406.0, + "step": 12188 + }, + { + "epoch": 1.338567977157918, + "grad_norm": 2.211733818054199, + "learning_rate": 1e-06, + "loss": 0.8195, + "mean_token_accuracy": 0.7534786462783813, + "num_tokens": 308370882.0, + "step": 12189 + }, + { + "epoch": 1.3386777948605315, + "grad_norm": 2.228459596633911, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7195176482200623, + "num_tokens": 308396628.0, + "step": 12190 + }, + { + "epoch": 1.338787612563145, + "grad_norm": 2.5478968620300293, + "learning_rate": 1e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7257483005523682, + "num_tokens": 308417296.0, + "step": 12191 + }, + { + "epoch": 1.3388974302657588, + "grad_norm": 2.210096597671509, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7215956449508667, + "num_tokens": 308443921.0, + "step": 12192 + }, + { + "epoch": 1.3390072479683726, + "grad_norm": 1.935951828956604, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7259889245033264, + "num_tokens": 308478566.0, + "step": 12193 + }, + { + "epoch": 1.339117065670986, + "grad_norm": 2.4142251014709473, + "learning_rate": 
1e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.7276291847229004, + "num_tokens": 308501249.0, + "step": 12194 + }, + { + "epoch": 1.3392268833735999, + "grad_norm": 2.2778546810150146, + "learning_rate": 1e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7216154336929321, + "num_tokens": 308527054.0, + "step": 12195 + }, + { + "epoch": 1.3393367010762134, + "grad_norm": 2.351423978805542, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7054493427276611, + "num_tokens": 308550752.0, + "step": 12196 + }, + { + "epoch": 1.3394465187788271, + "grad_norm": 2.373134136199951, + "learning_rate": 1e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7359023094177246, + "num_tokens": 308575463.0, + "step": 12197 + }, + { + "epoch": 1.339556336481441, + "grad_norm": 2.657668113708496, + "learning_rate": 1e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7387935519218445, + "num_tokens": 308593879.0, + "step": 12198 + }, + { + "epoch": 1.3396661541840544, + "grad_norm": 1.9896858930587769, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6869677305221558, + "num_tokens": 308626943.0, + "step": 12199 + }, + { + "epoch": 1.3397759718866682, + "grad_norm": 2.254230499267578, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7164784669876099, + "num_tokens": 308652849.0, + "step": 12200 + }, + { + "epoch": 1.3398857895892817, + "grad_norm": 2.119901657104492, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7151495218276978, + "num_tokens": 308681229.0, + "step": 12201 + }, + { + "epoch": 1.3399956072918955, + "grad_norm": 2.0543105602264404, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.707675576210022, + "num_tokens": 308713389.0, + "step": 12202 + }, + { + "epoch": 1.3401054249945092, + "grad_norm": 2.699352264404297, + "learning_rate": 1e-06, + "loss": 0.8024, + "mean_token_accuracy": 0.7530927062034607, + "num_tokens": 308733123.0, + "step": 12203 + }, + { + 
"epoch": 1.3402152426971228, + "grad_norm": 2.3104424476623535, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7188832759857178, + "num_tokens": 308758483.0, + "step": 12204 + }, + { + "epoch": 1.3403250603997363, + "grad_norm": 2.330336332321167, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7135682702064514, + "num_tokens": 308783401.0, + "step": 12205 + }, + { + "epoch": 1.34043487810235, + "grad_norm": 2.4303672313690186, + "learning_rate": 1e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.7265653014183044, + "num_tokens": 308805134.0, + "step": 12206 + }, + { + "epoch": 1.3405446958049638, + "grad_norm": 2.2598698139190674, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7073423862457275, + "num_tokens": 308831581.0, + "step": 12207 + }, + { + "epoch": 1.3406545135075774, + "grad_norm": 2.3014466762542725, + "learning_rate": 1e-06, + "loss": 0.8354, + "mean_token_accuracy": 0.7444127798080444, + "num_tokens": 308858748.0, + "step": 12208 + }, + { + "epoch": 1.3407643312101911, + "grad_norm": 2.4894535541534424, + "learning_rate": 1e-06, + "loss": 0.7901, + "mean_token_accuracy": 0.7515555024147034, + "num_tokens": 308880942.0, + "step": 12209 + }, + { + "epoch": 1.3408741489128047, + "grad_norm": 2.44655179977417, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7234796285629272, + "num_tokens": 308903348.0, + "step": 12210 + }, + { + "epoch": 1.3409839666154184, + "grad_norm": 2.0408520698547363, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7159748077392578, + "num_tokens": 308930878.0, + "step": 12211 + }, + { + "epoch": 1.3410937843180322, + "grad_norm": 2.386902093887329, + "learning_rate": 1e-06, + "loss": 0.8617, + "mean_token_accuracy": 0.7304064631462097, + "num_tokens": 308954080.0, + "step": 12212 + }, + { + "epoch": 1.3412036020206457, + "grad_norm": 2.3571929931640625, + "learning_rate": 1e-06, + "loss": 0.8383, + 
"mean_token_accuracy": 0.7352869510650635, + "num_tokens": 308977087.0, + "step": 12213 + }, + { + "epoch": 1.3413134197232595, + "grad_norm": 2.3010923862457275, + "learning_rate": 1e-06, + "loss": 0.8276, + "mean_token_accuracy": 0.7472556233406067, + "num_tokens": 309001279.0, + "step": 12214 + }, + { + "epoch": 1.341423237425873, + "grad_norm": 1.9744060039520264, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.6981947422027588, + "num_tokens": 309031698.0, + "step": 12215 + }, + { + "epoch": 1.3415330551284868, + "grad_norm": 2.2412314414978027, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7156935930252075, + "num_tokens": 309056493.0, + "step": 12216 + }, + { + "epoch": 1.3416428728311005, + "grad_norm": 2.1033878326416016, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7051150798797607, + "num_tokens": 309085438.0, + "step": 12217 + }, + { + "epoch": 1.341752690533714, + "grad_norm": 2.8523356914520264, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7238492965698242, + "num_tokens": 309102190.0, + "step": 12218 + }, + { + "epoch": 1.3418625082363276, + "grad_norm": 2.1303117275238037, + "learning_rate": 1e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.7248964905738831, + "num_tokens": 309129789.0, + "step": 12219 + }, + { + "epoch": 1.3419723259389413, + "grad_norm": 2.4056241512298584, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7089558839797974, + "num_tokens": 309152941.0, + "step": 12220 + }, + { + "epoch": 1.342082143641555, + "grad_norm": 2.4733800888061523, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7192564010620117, + "num_tokens": 309175734.0, + "step": 12221 + }, + { + "epoch": 1.3421919613441686, + "grad_norm": 1.9220881462097168, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.6929734945297241, + "num_tokens": 309208544.0, + "step": 12222 + }, + { + "epoch": 
1.3423017790467824, + "grad_norm": 2.2391860485076904, + "learning_rate": 1e-06, + "loss": 0.8326, + "mean_token_accuracy": 0.7415453791618347, + "num_tokens": 309232801.0, + "step": 12223 + }, + { + "epoch": 1.342411596749396, + "grad_norm": 2.125333786010742, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7312232255935669, + "num_tokens": 309258139.0, + "step": 12224 + }, + { + "epoch": 1.3425214144520097, + "grad_norm": 2.235271453857422, + "learning_rate": 1e-06, + "loss": 0.8106, + "mean_token_accuracy": 0.7424055337905884, + "num_tokens": 309283039.0, + "step": 12225 + }, + { + "epoch": 1.3426312321546234, + "grad_norm": 2.549037218093872, + "learning_rate": 1e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7193224430084229, + "num_tokens": 309306481.0, + "step": 12226 + }, + { + "epoch": 1.342741049857237, + "grad_norm": 2.377239227294922, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.71791672706604, + "num_tokens": 309331578.0, + "step": 12227 + }, + { + "epoch": 1.3428508675598507, + "grad_norm": 2.6743004322052, + "learning_rate": 1e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7237284183502197, + "num_tokens": 309350358.0, + "step": 12228 + }, + { + "epoch": 1.3429606852624643, + "grad_norm": 2.208919048309326, + "learning_rate": 1e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7227658033370972, + "num_tokens": 309379598.0, + "step": 12229 + }, + { + "epoch": 1.343070502965078, + "grad_norm": 2.403996467590332, + "learning_rate": 1e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7240227460861206, + "num_tokens": 309402776.0, + "step": 12230 + }, + { + "epoch": 1.3431803206676916, + "grad_norm": 2.373479127883911, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7274129390716553, + "num_tokens": 309426335.0, + "step": 12231 + }, + { + "epoch": 1.3432901383703053, + "grad_norm": 2.4042656421661377, + "learning_rate": 1e-06, + "loss": 0.8272, + "mean_token_accuracy": 
0.7311617136001587, + "num_tokens": 309448562.0, + "step": 12232 + }, + { + "epoch": 1.3433999560729188, + "grad_norm": 2.15159273147583, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7113898992538452, + "num_tokens": 309478590.0, + "step": 12233 + }, + { + "epoch": 1.3435097737755326, + "grad_norm": 2.0627059936523438, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7011404037475586, + "num_tokens": 309506892.0, + "step": 12234 + }, + { + "epoch": 1.3436195914781464, + "grad_norm": 1.9774575233459473, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7125037908554077, + "num_tokens": 309537331.0, + "step": 12235 + }, + { + "epoch": 1.34372940918076, + "grad_norm": 2.205791711807251, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7122231125831604, + "num_tokens": 309563672.0, + "step": 12236 + }, + { + "epoch": 1.3438392268833736, + "grad_norm": 2.3711204528808594, + "learning_rate": 1e-06, + "loss": 0.8428, + "mean_token_accuracy": 0.7287068367004395, + "num_tokens": 309586965.0, + "step": 12237 + }, + { + "epoch": 1.3439490445859872, + "grad_norm": 2.2880380153656006, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.6972255706787109, + "num_tokens": 309614623.0, + "step": 12238 + }, + { + "epoch": 1.344058862288601, + "grad_norm": 1.9481768608093262, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7114962339401245, + "num_tokens": 309648514.0, + "step": 12239 + }, + { + "epoch": 1.3441686799912147, + "grad_norm": 2.156061887741089, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7175893783569336, + "num_tokens": 309676354.0, + "step": 12240 + }, + { + "epoch": 1.3442784976938282, + "grad_norm": 2.16774320602417, + "learning_rate": 1e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.6894617080688477, + "num_tokens": 309703565.0, + "step": 12241 + }, + { + "epoch": 1.3443883153964418, + "grad_norm": 
2.2849948406219482, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7085554599761963, + "num_tokens": 309728156.0, + "step": 12242 + }, + { + "epoch": 1.3444981330990555, + "grad_norm": 2.714036226272583, + "learning_rate": 1e-06, + "loss": 0.7948, + "mean_token_accuracy": 0.7498180866241455, + "num_tokens": 309747080.0, + "step": 12243 + }, + { + "epoch": 1.3446079508016693, + "grad_norm": 1.9776817560195923, + "learning_rate": 1e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.7013634443283081, + "num_tokens": 309780051.0, + "step": 12244 + }, + { + "epoch": 1.3447177685042828, + "grad_norm": 2.2090368270874023, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7186523675918579, + "num_tokens": 309806693.0, + "step": 12245 + }, + { + "epoch": 1.3448275862068966, + "grad_norm": 2.4424610137939453, + "learning_rate": 1e-06, + "loss": 0.8345, + "mean_token_accuracy": 0.739324688911438, + "num_tokens": 309827530.0, + "step": 12246 + }, + { + "epoch": 1.34493740390951, + "grad_norm": 2.369208335876465, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7204817533493042, + "num_tokens": 309852329.0, + "step": 12247 + }, + { + "epoch": 1.3450472216121239, + "grad_norm": 2.379439353942871, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.6996731758117676, + "num_tokens": 309878054.0, + "step": 12248 + }, + { + "epoch": 1.3451570393147376, + "grad_norm": 2.0970747470855713, + "learning_rate": 1e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6874375343322754, + "num_tokens": 309908720.0, + "step": 12249 + }, + { + "epoch": 1.3452668570173512, + "grad_norm": 2.126476526260376, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.7039279937744141, + "num_tokens": 309939675.0, + "step": 12250 + }, + { + "epoch": 1.345376674719965, + "grad_norm": 2.658017158508301, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7433634996414185, + "num_tokens": 
309960372.0, + "step": 12251 + }, + { + "epoch": 1.3454864924225785, + "grad_norm": 2.3843743801116943, + "learning_rate": 1e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7188770174980164, + "num_tokens": 309985193.0, + "step": 12252 + }, + { + "epoch": 1.3455963101251922, + "grad_norm": 2.1560218334198, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.72307288646698, + "num_tokens": 310011861.0, + "step": 12253 + }, + { + "epoch": 1.345706127827806, + "grad_norm": 2.3903145790100098, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7133550643920898, + "num_tokens": 310034039.0, + "step": 12254 + }, + { + "epoch": 1.3458159455304195, + "grad_norm": 2.4379444122314453, + "learning_rate": 1e-06, + "loss": 0.822, + "mean_token_accuracy": 0.7427979111671448, + "num_tokens": 310054977.0, + "step": 12255 + }, + { + "epoch": 1.345925763233033, + "grad_norm": 2.371443271636963, + "learning_rate": 1e-06, + "loss": 0.8556, + "mean_token_accuracy": 0.7315821051597595, + "num_tokens": 310078226.0, + "step": 12256 + }, + { + "epoch": 1.3460355809356468, + "grad_norm": 2.349003314971924, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7171863317489624, + "num_tokens": 310101739.0, + "step": 12257 + }, + { + "epoch": 1.3461453986382605, + "grad_norm": 2.2223994731903076, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7144463658332825, + "num_tokens": 310130391.0, + "step": 12258 + }, + { + "epoch": 1.346255216340874, + "grad_norm": 2.3917670249938965, + "learning_rate": 1e-06, + "loss": 0.7912, + "mean_token_accuracy": 0.7532935738563538, + "num_tokens": 310152302.0, + "step": 12259 + }, + { + "epoch": 1.3463650340434878, + "grad_norm": 2.3281068801879883, + "learning_rate": 1e-06, + "loss": 0.8143, + "mean_token_accuracy": 0.7383062839508057, + "num_tokens": 310176110.0, + "step": 12260 + }, + { + "epoch": 1.3464748517461014, + "grad_norm": 2.612551212310791, + "learning_rate": 1e-06, 
+ "loss": 0.861, + "mean_token_accuracy": 0.7280916571617126, + "num_tokens": 310196914.0, + "step": 12261 + }, + { + "epoch": 1.3465846694487151, + "grad_norm": 2.2968335151672363, + "learning_rate": 1e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7240667343139648, + "num_tokens": 310222277.0, + "step": 12262 + }, + { + "epoch": 1.346694487151329, + "grad_norm": 2.0534071922302246, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7073615789413452, + "num_tokens": 310252317.0, + "step": 12263 + }, + { + "epoch": 1.3468043048539424, + "grad_norm": 2.2590901851654053, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.7029761672019958, + "num_tokens": 310280442.0, + "step": 12264 + }, + { + "epoch": 1.3469141225565562, + "grad_norm": 2.1630747318267822, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.726993203163147, + "num_tokens": 310308838.0, + "step": 12265 + }, + { + "epoch": 1.3470239402591697, + "grad_norm": 2.343823194503784, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7188982963562012, + "num_tokens": 310331869.0, + "step": 12266 + }, + { + "epoch": 1.3471337579617835, + "grad_norm": 2.0541181564331055, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.6937333345413208, + "num_tokens": 310361483.0, + "step": 12267 + }, + { + "epoch": 1.3472435756643972, + "grad_norm": 2.245138168334961, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.694966733455658, + "num_tokens": 310388142.0, + "step": 12268 + }, + { + "epoch": 1.3473533933670108, + "grad_norm": 2.5651748180389404, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7005643844604492, + "num_tokens": 310408033.0, + "step": 12269 + }, + { + "epoch": 1.3474632110696243, + "grad_norm": 2.218188762664795, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7017909288406372, + "num_tokens": 310434251.0, + "step": 12270 + }, + { + "epoch": 
1.347573028772238, + "grad_norm": 1.9182815551757812, + "learning_rate": 1e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7337862849235535, + "num_tokens": 310466308.0, + "step": 12271 + }, + { + "epoch": 1.3476828464748518, + "grad_norm": 2.345885753631592, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7197613716125488, + "num_tokens": 310491921.0, + "step": 12272 + }, + { + "epoch": 1.3477926641774653, + "grad_norm": 2.272385835647583, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.6992765665054321, + "num_tokens": 310517693.0, + "step": 12273 + }, + { + "epoch": 1.347902481880079, + "grad_norm": 2.344392776489258, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7023047208786011, + "num_tokens": 310541736.0, + "step": 12274 + }, + { + "epoch": 1.3480122995826926, + "grad_norm": 2.083878993988037, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7090294361114502, + "num_tokens": 310572005.0, + "step": 12275 + }, + { + "epoch": 1.3481221172853064, + "grad_norm": 2.0144360065460205, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7297205924987793, + "num_tokens": 310601258.0, + "step": 12276 + }, + { + "epoch": 1.3482319349879202, + "grad_norm": 2.2983689308166504, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7211275696754456, + "num_tokens": 310625801.0, + "step": 12277 + }, + { + "epoch": 1.3483417526905337, + "grad_norm": 2.159022569656372, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.7026392221450806, + "num_tokens": 310655714.0, + "step": 12278 + }, + { + "epoch": 1.3484515703931474, + "grad_norm": 1.9646844863891602, + "learning_rate": 1e-06, + "loss": 1.0321, + "mean_token_accuracy": 0.6970494389533997, + "num_tokens": 310691108.0, + "step": 12279 + }, + { + "epoch": 1.348561388095761, + "grad_norm": 2.0643932819366455, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 
0.6969722509384155, + "num_tokens": 310721946.0, + "step": 12280 + }, + { + "epoch": 1.3486712057983747, + "grad_norm": 2.4758541584014893, + "learning_rate": 1e-06, + "loss": 0.8532, + "mean_token_accuracy": 0.7280610799789429, + "num_tokens": 310742609.0, + "step": 12281 + }, + { + "epoch": 1.3487810235009885, + "grad_norm": 2.172391653060913, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7198340892791748, + "num_tokens": 310768062.0, + "step": 12282 + }, + { + "epoch": 1.348890841203602, + "grad_norm": 2.100236177444458, + "learning_rate": 1e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.7363400459289551, + "num_tokens": 310797000.0, + "step": 12283 + }, + { + "epoch": 1.3490006589062156, + "grad_norm": 2.3322596549987793, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7213025093078613, + "num_tokens": 310820213.0, + "step": 12284 + }, + { + "epoch": 1.3491104766088293, + "grad_norm": 2.099999189376831, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7017732262611389, + "num_tokens": 310848601.0, + "step": 12285 + }, + { + "epoch": 1.349220294311443, + "grad_norm": 2.345407247543335, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7047830820083618, + "num_tokens": 310873323.0, + "step": 12286 + }, + { + "epoch": 1.3493301120140566, + "grad_norm": 2.493565320968628, + "learning_rate": 1e-06, + "loss": 0.8301, + "mean_token_accuracy": 0.7345913052558899, + "num_tokens": 310895460.0, + "step": 12287 + }, + { + "epoch": 1.3494399297166704, + "grad_norm": 2.4327938556671143, + "learning_rate": 1e-06, + "loss": 0.7984, + "mean_token_accuracy": 0.7442107200622559, + "num_tokens": 310916404.0, + "step": 12288 + }, + { + "epoch": 1.349549747419284, + "grad_norm": 2.170311450958252, + "learning_rate": 1e-06, + "loss": 0.84, + "mean_token_accuracy": 0.7380456924438477, + "num_tokens": 310940796.0, + "step": 12289 + }, + { + "epoch": 1.3496595651218977, + "grad_norm": 
2.1767585277557373, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7083896398544312, + "num_tokens": 310966660.0, + "step": 12290 + }, + { + "epoch": 1.3497693828245114, + "grad_norm": 2.347975730895996, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7222215533256531, + "num_tokens": 310990430.0, + "step": 12291 + }, + { + "epoch": 1.349879200527125, + "grad_norm": 2.1140594482421875, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7217890024185181, + "num_tokens": 311018504.0, + "step": 12292 + }, + { + "epoch": 1.3499890182297387, + "grad_norm": 2.396371603012085, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7113497853279114, + "num_tokens": 311042197.0, + "step": 12293 + }, + { + "epoch": 1.3500988359323522, + "grad_norm": 2.4198100566864014, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.715312123298645, + "num_tokens": 311064327.0, + "step": 12294 + }, + { + "epoch": 1.350208653634966, + "grad_norm": 2.498530626296997, + "learning_rate": 1e-06, + "loss": 0.8402, + "mean_token_accuracy": 0.7370093464851379, + "num_tokens": 311084478.0, + "step": 12295 + }, + { + "epoch": 1.3503184713375795, + "grad_norm": 2.2806525230407715, + "learning_rate": 1e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.7310469150543213, + "num_tokens": 311110284.0, + "step": 12296 + }, + { + "epoch": 1.3504282890401933, + "grad_norm": 2.2845981121063232, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7195172905921936, + "num_tokens": 311137737.0, + "step": 12297 + }, + { + "epoch": 1.3505381067428068, + "grad_norm": 1.944258451461792, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7226129174232483, + "num_tokens": 311170323.0, + "step": 12298 + }, + { + "epoch": 1.3506479244454206, + "grad_norm": 2.1455812454223633, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7066816687583923, + "num_tokens": 
311197134.0, + "step": 12299 + }, + { + "epoch": 1.3507577421480343, + "grad_norm": 2.273210287094116, + "learning_rate": 1e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.7328532934188843, + "num_tokens": 311221223.0, + "step": 12300 + }, + { + "epoch": 1.3508675598506479, + "grad_norm": 2.1185576915740967, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.6927847266197205, + "num_tokens": 311249813.0, + "step": 12301 + }, + { + "epoch": 1.3509773775532616, + "grad_norm": 2.559828758239746, + "learning_rate": 1e-06, + "loss": 0.8785, + "mean_token_accuracy": 0.724075436592102, + "num_tokens": 311269404.0, + "step": 12302 + }, + { + "epoch": 1.3510871952558752, + "grad_norm": 2.1563596725463867, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7181591987609863, + "num_tokens": 311296603.0, + "step": 12303 + }, + { + "epoch": 1.351197012958489, + "grad_norm": 2.4551260471343994, + "learning_rate": 1e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7261207103729248, + "num_tokens": 311319109.0, + "step": 12304 + }, + { + "epoch": 1.3513068306611027, + "grad_norm": 2.3787038326263428, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7093862295150757, + "num_tokens": 311342527.0, + "step": 12305 + }, + { + "epoch": 1.3514166483637162, + "grad_norm": 2.797361135482788, + "learning_rate": 1e-06, + "loss": 0.7579, + "mean_token_accuracy": 0.752246081829071, + "num_tokens": 311359436.0, + "step": 12306 + }, + { + "epoch": 1.3515264660663298, + "grad_norm": 2.149590492248535, + "learning_rate": 1e-06, + "loss": 1.015, + "mean_token_accuracy": 0.6873636841773987, + "num_tokens": 311386185.0, + "step": 12307 + }, + { + "epoch": 1.3516362837689435, + "grad_norm": 2.5255820751190186, + "learning_rate": 1e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.731858491897583, + "num_tokens": 311406358.0, + "step": 12308 + }, + { + "epoch": 1.3517461014715573, + "grad_norm": 2.3384897708892822, + "learning_rate": 1e-06, 
+ "loss": 0.9319, + "mean_token_accuracy": 0.713127613067627, + "num_tokens": 311431124.0, + "step": 12309 + }, + { + "epoch": 1.3518559191741708, + "grad_norm": 2.507598638534546, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7125132083892822, + "num_tokens": 311453424.0, + "step": 12310 + }, + { + "epoch": 1.3519657368767846, + "grad_norm": 2.3264529705047607, + "learning_rate": 1e-06, + "loss": 0.8658, + "mean_token_accuracy": 0.7262446880340576, + "num_tokens": 311478169.0, + "step": 12311 + }, + { + "epoch": 1.352075554579398, + "grad_norm": 2.2850966453552246, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7221685647964478, + "num_tokens": 311501925.0, + "step": 12312 + }, + { + "epoch": 1.3521853722820119, + "grad_norm": 2.2118072509765625, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7181703448295593, + "num_tokens": 311527226.0, + "step": 12313 + }, + { + "epoch": 1.3522951899846256, + "grad_norm": 2.0793724060058594, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7293922305107117, + "num_tokens": 311555623.0, + "step": 12314 + }, + { + "epoch": 1.3524050076872391, + "grad_norm": 2.573516607284546, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7181880474090576, + "num_tokens": 311578236.0, + "step": 12315 + }, + { + "epoch": 1.352514825389853, + "grad_norm": 2.2671875953674316, + "learning_rate": 1e-06, + "loss": 0.8374, + "mean_token_accuracy": 0.7356691956520081, + "num_tokens": 311605842.0, + "step": 12316 + }, + { + "epoch": 1.3526246430924664, + "grad_norm": 2.248061418533325, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7219101190567017, + "num_tokens": 311631884.0, + "step": 12317 + }, + { + "epoch": 1.3527344607950802, + "grad_norm": 2.273550033569336, + "learning_rate": 1e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.7290025949478149, + "num_tokens": 311657880.0, + "step": 12318 + }, + { + 
"epoch": 1.352844278497694, + "grad_norm": 2.2286205291748047, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7123117446899414, + "num_tokens": 311683376.0, + "step": 12319 + }, + { + "epoch": 1.3529540962003075, + "grad_norm": 2.162100315093994, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.705154299736023, + "num_tokens": 311711150.0, + "step": 12320 + }, + { + "epoch": 1.353063913902921, + "grad_norm": 2.4447340965270996, + "learning_rate": 1e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7241536974906921, + "num_tokens": 311735075.0, + "step": 12321 + }, + { + "epoch": 1.3531737316055348, + "grad_norm": 2.0629513263702393, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7227378487586975, + "num_tokens": 311765184.0, + "step": 12322 + }, + { + "epoch": 1.3532835493081485, + "grad_norm": 2.383725881576538, + "learning_rate": 1e-06, + "loss": 0.8188, + "mean_token_accuracy": 0.738533616065979, + "num_tokens": 311787969.0, + "step": 12323 + }, + { + "epoch": 1.353393367010762, + "grad_norm": 2.3134474754333496, + "learning_rate": 1e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.7305333614349365, + "num_tokens": 311810923.0, + "step": 12324 + }, + { + "epoch": 1.3535031847133758, + "grad_norm": 2.186401605606079, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7110592126846313, + "num_tokens": 311836030.0, + "step": 12325 + }, + { + "epoch": 1.3536130024159894, + "grad_norm": 2.4883880615234375, + "learning_rate": 1e-06, + "loss": 0.8786, + "mean_token_accuracy": 0.7228367924690247, + "num_tokens": 311859102.0, + "step": 12326 + }, + { + "epoch": 1.3537228201186031, + "grad_norm": 2.235305070877075, + "learning_rate": 1e-06, + "loss": 0.8587, + "mean_token_accuracy": 0.7280033826828003, + "num_tokens": 311883562.0, + "step": 12327 + }, + { + "epoch": 1.3538326378212169, + "grad_norm": 2.310455083847046, + "learning_rate": 1e-06, + "loss": 0.9696, + 
"mean_token_accuracy": 0.7152564525604248, + "num_tokens": 311907787.0, + "step": 12328 + }, + { + "epoch": 1.3539424555238304, + "grad_norm": 2.2115261554718018, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7215858697891235, + "num_tokens": 311933926.0, + "step": 12329 + }, + { + "epoch": 1.3540522732264442, + "grad_norm": 2.1918082237243652, + "learning_rate": 1e-06, + "loss": 0.8551, + "mean_token_accuracy": 0.7303013801574707, + "num_tokens": 311959451.0, + "step": 12330 + }, + { + "epoch": 1.3541620909290577, + "grad_norm": 2.1528241634368896, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7145799994468689, + "num_tokens": 311987556.0, + "step": 12331 + }, + { + "epoch": 1.3542719086316715, + "grad_norm": 2.378159761428833, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7185148000717163, + "num_tokens": 312011573.0, + "step": 12332 + }, + { + "epoch": 1.3543817263342852, + "grad_norm": 2.350994110107422, + "learning_rate": 1e-06, + "loss": 0.8257, + "mean_token_accuracy": 0.7410871982574463, + "num_tokens": 312033619.0, + "step": 12333 + }, + { + "epoch": 1.3544915440368988, + "grad_norm": 2.2129955291748047, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7184498310089111, + "num_tokens": 312062257.0, + "step": 12334 + }, + { + "epoch": 1.3546013617395123, + "grad_norm": 2.0570731163024902, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7023787498474121, + "num_tokens": 312093158.0, + "step": 12335 + }, + { + "epoch": 1.354711179442126, + "grad_norm": 2.7599868774414062, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.734655499458313, + "num_tokens": 312111038.0, + "step": 12336 + }, + { + "epoch": 1.3548209971447398, + "grad_norm": 2.2876675128936768, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7091554403305054, + "num_tokens": 312137467.0, + "step": 12337 + }, + { + "epoch": 
1.3549308148473533, + "grad_norm": 2.5362958908081055, + "learning_rate": 1e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.719399631023407, + "num_tokens": 312157908.0, + "step": 12338 + }, + { + "epoch": 1.355040632549967, + "grad_norm": 2.0876007080078125, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7261362075805664, + "num_tokens": 312187246.0, + "step": 12339 + }, + { + "epoch": 1.3551504502525806, + "grad_norm": 2.3824191093444824, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7141127586364746, + "num_tokens": 312212084.0, + "step": 12340 + }, + { + "epoch": 1.3552602679551944, + "grad_norm": 2.160403251647949, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.705682098865509, + "num_tokens": 312238989.0, + "step": 12341 + }, + { + "epoch": 1.3553700856578081, + "grad_norm": 2.5229220390319824, + "learning_rate": 1e-06, + "loss": 0.8264, + "mean_token_accuracy": 0.7356287240982056, + "num_tokens": 312257993.0, + "step": 12342 + }, + { + "epoch": 1.3554799033604217, + "grad_norm": 2.2178781032562256, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7313860654830933, + "num_tokens": 312284081.0, + "step": 12343 + }, + { + "epoch": 1.3555897210630354, + "grad_norm": 2.0710225105285645, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7148818373680115, + "num_tokens": 312314145.0, + "step": 12344 + }, + { + "epoch": 1.355699538765649, + "grad_norm": 2.417405366897583, + "learning_rate": 1e-06, + "loss": 0.8545, + "mean_token_accuracy": 0.733283281326294, + "num_tokens": 312336223.0, + "step": 12345 + }, + { + "epoch": 1.3558093564682627, + "grad_norm": 2.423227310180664, + "learning_rate": 1e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.7239660024642944, + "num_tokens": 312358651.0, + "step": 12346 + }, + { + "epoch": 1.3559191741708763, + "grad_norm": 2.5777101516723633, + "learning_rate": 1e-06, + "loss": 0.8076, + "mean_token_accuracy": 
0.7411404252052307, + "num_tokens": 312377331.0, + "step": 12347 + }, + { + "epoch": 1.35602899187349, + "grad_norm": 2.289052724838257, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7265626192092896, + "num_tokens": 312401156.0, + "step": 12348 + }, + { + "epoch": 1.3561388095761036, + "grad_norm": 2.351534843444824, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.716381847858429, + "num_tokens": 312424867.0, + "step": 12349 + }, + { + "epoch": 1.3562486272787173, + "grad_norm": 2.2583370208740234, + "learning_rate": 1e-06, + "loss": 0.8618, + "mean_token_accuracy": 0.7332767248153687, + "num_tokens": 312451401.0, + "step": 12350 + }, + { + "epoch": 1.356358444981331, + "grad_norm": 2.1499125957489014, + "learning_rate": 1e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.7121078968048096, + "num_tokens": 312478728.0, + "step": 12351 + }, + { + "epoch": 1.3564682626839446, + "grad_norm": 2.4042129516601562, + "learning_rate": 1e-06, + "loss": 0.7459, + "mean_token_accuracy": 0.7659410834312439, + "num_tokens": 312499234.0, + "step": 12352 + }, + { + "epoch": 1.3565780803865584, + "grad_norm": 2.111743688583374, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.6931189298629761, + "num_tokens": 312528027.0, + "step": 12353 + }, + { + "epoch": 1.356687898089172, + "grad_norm": 2.10396146774292, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7170552611351013, + "num_tokens": 312556766.0, + "step": 12354 + }, + { + "epoch": 1.3567977157917857, + "grad_norm": 2.1845109462738037, + "learning_rate": 1e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7328104972839355, + "num_tokens": 312583587.0, + "step": 12355 + }, + { + "epoch": 1.3569075334943994, + "grad_norm": 2.5594441890716553, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7200665473937988, + "num_tokens": 312603950.0, + "step": 12356 + }, + { + "epoch": 1.357017351197013, + "grad_norm": 
2.44549822807312, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7224504947662354, + "num_tokens": 312627793.0, + "step": 12357 + }, + { + "epoch": 1.3571271688996267, + "grad_norm": 2.2466211318969727, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7297263145446777, + "num_tokens": 312654068.0, + "step": 12358 + }, + { + "epoch": 1.3572369866022402, + "grad_norm": 2.5495619773864746, + "learning_rate": 1e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.7426326870918274, + "num_tokens": 312675108.0, + "step": 12359 + }, + { + "epoch": 1.357346804304854, + "grad_norm": 2.058262825012207, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.693484902381897, + "num_tokens": 312703329.0, + "step": 12360 + }, + { + "epoch": 1.3574566220074675, + "grad_norm": 2.3345232009887695, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.71596759557724, + "num_tokens": 312727024.0, + "step": 12361 + }, + { + "epoch": 1.3575664397100813, + "grad_norm": 2.371342897415161, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7053874731063843, + "num_tokens": 312752929.0, + "step": 12362 + }, + { + "epoch": 1.3576762574126948, + "grad_norm": 2.4210124015808105, + "learning_rate": 1e-06, + "loss": 0.8051, + "mean_token_accuracy": 0.7474715113639832, + "num_tokens": 312774601.0, + "step": 12363 + }, + { + "epoch": 1.3577860751153086, + "grad_norm": 2.3747191429138184, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7217862606048584, + "num_tokens": 312797341.0, + "step": 12364 + }, + { + "epoch": 1.3578958928179223, + "grad_norm": 2.206930160522461, + "learning_rate": 1e-06, + "loss": 0.7794, + "mean_token_accuracy": 0.75350022315979, + "num_tokens": 312822457.0, + "step": 12365 + }, + { + "epoch": 1.3580057105205359, + "grad_norm": 2.6538338661193848, + "learning_rate": 1e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7167741656303406, + "num_tokens": 
312842488.0, + "step": 12366 + }, + { + "epoch": 1.3581155282231496, + "grad_norm": 2.2458009719848633, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7208279371261597, + "num_tokens": 312869446.0, + "step": 12367 + }, + { + "epoch": 1.3582253459257632, + "grad_norm": 2.4760842323303223, + "learning_rate": 1e-06, + "loss": 0.8221, + "mean_token_accuracy": 0.7434684634208679, + "num_tokens": 312889860.0, + "step": 12368 + }, + { + "epoch": 1.358335163628377, + "grad_norm": 2.5714645385742188, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7066056728363037, + "num_tokens": 312911715.0, + "step": 12369 + }, + { + "epoch": 1.3584449813309907, + "grad_norm": 2.043978691101074, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.6952812671661377, + "num_tokens": 312943138.0, + "step": 12370 + }, + { + "epoch": 1.3585547990336042, + "grad_norm": 2.1214635372161865, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7145517468452454, + "num_tokens": 312971974.0, + "step": 12371 + }, + { + "epoch": 1.3586646167362177, + "grad_norm": 2.8166885375976562, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7147901058197021, + "num_tokens": 312992112.0, + "step": 12372 + }, + { + "epoch": 1.3587744344388315, + "grad_norm": 2.310817003250122, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.6965255737304688, + "num_tokens": 313018355.0, + "step": 12373 + }, + { + "epoch": 1.3588842521414453, + "grad_norm": 2.1421561241149902, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7044923305511475, + "num_tokens": 313043304.0, + "step": 12374 + }, + { + "epoch": 1.3589940698440588, + "grad_norm": 2.314419746398926, + "learning_rate": 1e-06, + "loss": 0.8302, + "mean_token_accuracy": 0.7350665330886841, + "num_tokens": 313066504.0, + "step": 12375 + }, + { + "epoch": 1.3591038875466726, + "grad_norm": 2.273822546005249, + "learning_rate": 
1e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.7294858694076538, + "num_tokens": 313092068.0, + "step": 12376 + }, + { + "epoch": 1.359213705249286, + "grad_norm": 2.2182698249816895, + "learning_rate": 1e-06, + "loss": 0.8507, + "mean_token_accuracy": 0.728013277053833, + "num_tokens": 313115546.0, + "step": 12377 + }, + { + "epoch": 1.3593235229518998, + "grad_norm": 2.394325017929077, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7301130294799805, + "num_tokens": 313139061.0, + "step": 12378 + }, + { + "epoch": 1.3594333406545136, + "grad_norm": 2.2937121391296387, + "learning_rate": 1e-06, + "loss": 0.8049, + "mean_token_accuracy": 0.7431179285049438, + "num_tokens": 313162025.0, + "step": 12379 + }, + { + "epoch": 1.3595431583571271, + "grad_norm": 2.5253751277923584, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7154978513717651, + "num_tokens": 313183380.0, + "step": 12380 + }, + { + "epoch": 1.359652976059741, + "grad_norm": 2.1543805599212646, + "learning_rate": 1e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7257365584373474, + "num_tokens": 313211305.0, + "step": 12381 + }, + { + "epoch": 1.3597627937623544, + "grad_norm": 2.615969181060791, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7169383764266968, + "num_tokens": 313230917.0, + "step": 12382 + }, + { + "epoch": 1.3598726114649682, + "grad_norm": 2.5301523208618164, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7000994086265564, + "num_tokens": 313254649.0, + "step": 12383 + }, + { + "epoch": 1.359982429167582, + "grad_norm": 2.4682021141052246, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7101588249206543, + "num_tokens": 313278154.0, + "step": 12384 + }, + { + "epoch": 1.3600922468701955, + "grad_norm": 2.3806264400482178, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7057502269744873, + "num_tokens": 313303127.0, + "step": 12385 + }, + { + 
"epoch": 1.360202064572809, + "grad_norm": 2.1309378147125244, + "learning_rate": 1e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.6934229135513306, + "num_tokens": 313333641.0, + "step": 12386 + }, + { + "epoch": 1.3603118822754228, + "grad_norm": 2.4162397384643555, + "learning_rate": 1e-06, + "loss": 0.8697, + "mean_token_accuracy": 0.731804370880127, + "num_tokens": 313356405.0, + "step": 12387 + }, + { + "epoch": 1.3604216999780365, + "grad_norm": 2.1133456230163574, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7116050124168396, + "num_tokens": 313386012.0, + "step": 12388 + }, + { + "epoch": 1.36053151768065, + "grad_norm": 2.3122446537017822, + "learning_rate": 1e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7414909601211548, + "num_tokens": 313409359.0, + "step": 12389 + }, + { + "epoch": 1.3606413353832638, + "grad_norm": 1.9888352155685425, + "learning_rate": 1e-06, + "loss": 0.8296, + "mean_token_accuracy": 0.7455301284790039, + "num_tokens": 313440583.0, + "step": 12390 + }, + { + "epoch": 1.3607511530858774, + "grad_norm": 2.3420863151550293, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7171608209609985, + "num_tokens": 313462991.0, + "step": 12391 + }, + { + "epoch": 1.360860970788491, + "grad_norm": 2.192218780517578, + "learning_rate": 1e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.695725679397583, + "num_tokens": 313489385.0, + "step": 12392 + }, + { + "epoch": 1.3609707884911049, + "grad_norm": 2.818286418914795, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7021512985229492, + "num_tokens": 313516059.0, + "step": 12393 + }, + { + "epoch": 1.3610806061937184, + "grad_norm": 2.4727892875671387, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7286129593849182, + "num_tokens": 313538102.0, + "step": 12394 + }, + { + "epoch": 1.3611904238963322, + "grad_norm": 2.279282808303833, + "learning_rate": 1e-06, + "loss": 0.8189, + 
"mean_token_accuracy": 0.7399864792823792, + "num_tokens": 313563168.0, + "step": 12395 + }, + { + "epoch": 1.3613002415989457, + "grad_norm": 2.384951591491699, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7222621440887451, + "num_tokens": 313587102.0, + "step": 12396 + }, + { + "epoch": 1.3614100593015594, + "grad_norm": 2.1335649490356445, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.6973424553871155, + "num_tokens": 313615895.0, + "step": 12397 + }, + { + "epoch": 1.3615198770041732, + "grad_norm": 2.0557451248168945, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7074412107467651, + "num_tokens": 313646517.0, + "step": 12398 + }, + { + "epoch": 1.3616296947067867, + "grad_norm": 2.1658084392547607, + "learning_rate": 1e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.6948245763778687, + "num_tokens": 313676080.0, + "step": 12399 + }, + { + "epoch": 1.3617395124094003, + "grad_norm": 2.1905667781829834, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7131190299987793, + "num_tokens": 313703649.0, + "step": 12400 + }, + { + "epoch": 1.361849330112014, + "grad_norm": 2.5271551609039307, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7115495204925537, + "num_tokens": 313724512.0, + "step": 12401 + }, + { + "epoch": 1.3619591478146278, + "grad_norm": 2.279392957687378, + "learning_rate": 1e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.7257379293441772, + "num_tokens": 313747821.0, + "step": 12402 + }, + { + "epoch": 1.3620689655172413, + "grad_norm": 2.4469504356384277, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7054368257522583, + "num_tokens": 313770488.0, + "step": 12403 + }, + { + "epoch": 1.362178783219855, + "grad_norm": 2.233475923538208, + "learning_rate": 1e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7210044860839844, + "num_tokens": 313797506.0, + "step": 12404 + }, + { + "epoch": 
1.3622886009224686, + "grad_norm": 2.2152979373931885, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.6945873498916626, + "num_tokens": 313826154.0, + "step": 12405 + }, + { + "epoch": 1.3623984186250824, + "grad_norm": 2.387557029724121, + "learning_rate": 1e-06, + "loss": 0.792, + "mean_token_accuracy": 0.7440062165260315, + "num_tokens": 313848654.0, + "step": 12406 + }, + { + "epoch": 1.3625082363276961, + "grad_norm": 2.4530725479125977, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7171483039855957, + "num_tokens": 313870964.0, + "step": 12407 + }, + { + "epoch": 1.3626180540303097, + "grad_norm": 2.5000410079956055, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7184048891067505, + "num_tokens": 313893740.0, + "step": 12408 + }, + { + "epoch": 1.3627278717329234, + "grad_norm": 2.139225721359253, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7275104522705078, + "num_tokens": 313921496.0, + "step": 12409 + }, + { + "epoch": 1.362837689435537, + "grad_norm": 2.292266368865967, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7238450050354004, + "num_tokens": 313949469.0, + "step": 12410 + }, + { + "epoch": 1.3629475071381507, + "grad_norm": 2.323826313018799, + "learning_rate": 1e-06, + "loss": 0.8466, + "mean_token_accuracy": 0.7376435995101929, + "num_tokens": 313973364.0, + "step": 12411 + }, + { + "epoch": 1.3630573248407643, + "grad_norm": 2.109309196472168, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7086135149002075, + "num_tokens": 314000300.0, + "step": 12412 + }, + { + "epoch": 1.363167142543378, + "grad_norm": 2.3845620155334473, + "learning_rate": 1e-06, + "loss": 0.8467, + "mean_token_accuracy": 0.7309819459915161, + "num_tokens": 314022783.0, + "step": 12413 + }, + { + "epoch": 1.3632769602459915, + "grad_norm": 2.375633955001831, + "learning_rate": 1e-06, + "loss": 0.8533, + "mean_token_accuracy": 
0.7338074445724487, + "num_tokens": 314044916.0, + "step": 12414 + }, + { + "epoch": 1.3633867779486053, + "grad_norm": 2.274639129638672, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7101880311965942, + "num_tokens": 314069963.0, + "step": 12415 + }, + { + "epoch": 1.363496595651219, + "grad_norm": 2.246964931488037, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.7004144191741943, + "num_tokens": 314097748.0, + "step": 12416 + }, + { + "epoch": 1.3636064133538326, + "grad_norm": 2.251695394515991, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7195674180984497, + "num_tokens": 314121892.0, + "step": 12417 + }, + { + "epoch": 1.3637162310564463, + "grad_norm": 1.9543163776397705, + "learning_rate": 1e-06, + "loss": 1.0168, + "mean_token_accuracy": 0.6811380982398987, + "num_tokens": 314155871.0, + "step": 12418 + }, + { + "epoch": 1.3638260487590599, + "grad_norm": 2.192753791809082, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7118897438049316, + "num_tokens": 314186705.0, + "step": 12419 + }, + { + "epoch": 1.3639358664616736, + "grad_norm": 2.244800090789795, + "learning_rate": 1e-06, + "loss": 0.8568, + "mean_token_accuracy": 0.7277367115020752, + "num_tokens": 314213070.0, + "step": 12420 + }, + { + "epoch": 1.3640456841642874, + "grad_norm": 2.1768407821655273, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7229888439178467, + "num_tokens": 314240401.0, + "step": 12421 + }, + { + "epoch": 1.364155501866901, + "grad_norm": 2.030811071395874, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7040702104568481, + "num_tokens": 314271232.0, + "step": 12422 + }, + { + "epoch": 1.3642653195695145, + "grad_norm": 2.4354751110076904, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7216857075691223, + "num_tokens": 314293598.0, + "step": 12423 + }, + { + "epoch": 1.3643751372721282, + "grad_norm": 
2.1589131355285645, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7143647074699402, + "num_tokens": 314320661.0, + "step": 12424 + }, + { + "epoch": 1.364484954974742, + "grad_norm": 2.7661242485046387, + "learning_rate": 1e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7241524457931519, + "num_tokens": 314338972.0, + "step": 12425 + }, + { + "epoch": 1.3645947726773555, + "grad_norm": 2.3579790592193604, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.6967523694038391, + "num_tokens": 314365254.0, + "step": 12426 + }, + { + "epoch": 1.3647045903799693, + "grad_norm": 2.587096929550171, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7104888558387756, + "num_tokens": 314388437.0, + "step": 12427 + }, + { + "epoch": 1.3648144080825828, + "grad_norm": 2.180800676345825, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.6952784657478333, + "num_tokens": 314418616.0, + "step": 12428 + }, + { + "epoch": 1.3649242257851966, + "grad_norm": 2.5029871463775635, + "learning_rate": 1e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7274512052536011, + "num_tokens": 314440705.0, + "step": 12429 + }, + { + "epoch": 1.3650340434878103, + "grad_norm": 2.008296251296997, + "learning_rate": 1e-06, + "loss": 0.8126, + "mean_token_accuracy": 0.7444241046905518, + "num_tokens": 314468073.0, + "step": 12430 + }, + { + "epoch": 1.3651438611904239, + "grad_norm": 2.01550030708313, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7072383165359497, + "num_tokens": 314498496.0, + "step": 12431 + }, + { + "epoch": 1.3652536788930376, + "grad_norm": 2.2503397464752197, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7118651866912842, + "num_tokens": 314522668.0, + "step": 12432 + }, + { + "epoch": 1.3653634965956511, + "grad_norm": 2.2245330810546875, + "learning_rate": 1e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7258063554763794, + "num_tokens": 
314546852.0, + "step": 12433 + }, + { + "epoch": 1.365473314298265, + "grad_norm": 2.353260040283203, + "learning_rate": 1e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7228493094444275, + "num_tokens": 314569432.0, + "step": 12434 + }, + { + "epoch": 1.3655831320008787, + "grad_norm": 2.257904529571533, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.705634355545044, + "num_tokens": 314594911.0, + "step": 12435 + }, + { + "epoch": 1.3656929497034922, + "grad_norm": 2.2396275997161865, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7161492109298706, + "num_tokens": 314619073.0, + "step": 12436 + }, + { + "epoch": 1.3658027674061057, + "grad_norm": 2.3560245037078857, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.6923208236694336, + "num_tokens": 314643754.0, + "step": 12437 + }, + { + "epoch": 1.3659125851087195, + "grad_norm": 2.34063458442688, + "learning_rate": 1e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.7330721616744995, + "num_tokens": 314667582.0, + "step": 12438 + }, + { + "epoch": 1.3660224028113332, + "grad_norm": 2.510504961013794, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7226714491844177, + "num_tokens": 314689177.0, + "step": 12439 + }, + { + "epoch": 1.3661322205139468, + "grad_norm": 2.3662073612213135, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7087475061416626, + "num_tokens": 314714540.0, + "step": 12440 + }, + { + "epoch": 1.3662420382165605, + "grad_norm": 2.40199875831604, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7166418433189392, + "num_tokens": 314739287.0, + "step": 12441 + }, + { + "epoch": 1.366351855919174, + "grad_norm": 2.2389488220214844, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.6986047625541687, + "num_tokens": 314765796.0, + "step": 12442 + }, + { + "epoch": 1.3664616736217878, + "grad_norm": 2.318129539489746, + "learning_rate": 1e-06, + 
"loss": 0.8492, + "mean_token_accuracy": 0.7363965511322021, + "num_tokens": 314788501.0, + "step": 12443 + }, + { + "epoch": 1.3665714913244016, + "grad_norm": 2.608492136001587, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7218144536018372, + "num_tokens": 314808051.0, + "step": 12444 + }, + { + "epoch": 1.3666813090270151, + "grad_norm": 2.3566267490386963, + "learning_rate": 1e-06, + "loss": 0.8189, + "mean_token_accuracy": 0.7359800934791565, + "num_tokens": 314830085.0, + "step": 12445 + }, + { + "epoch": 1.3667911267296289, + "grad_norm": 2.271491765975952, + "learning_rate": 1e-06, + "loss": 0.8647, + "mean_token_accuracy": 0.7237844467163086, + "num_tokens": 314855280.0, + "step": 12446 + }, + { + "epoch": 1.3669009444322424, + "grad_norm": 2.7034683227539062, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7091309428215027, + "num_tokens": 314874753.0, + "step": 12447 + }, + { + "epoch": 1.3670107621348562, + "grad_norm": 2.5888712406158447, + "learning_rate": 1e-06, + "loss": 0.7742, + "mean_token_accuracy": 0.7470101118087769, + "num_tokens": 314893945.0, + "step": 12448 + }, + { + "epoch": 1.36712057983747, + "grad_norm": 2.2943384647369385, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7311564087867737, + "num_tokens": 314917068.0, + "step": 12449 + }, + { + "epoch": 1.3672303975400835, + "grad_norm": 2.385312080383301, + "learning_rate": 1e-06, + "loss": 0.842, + "mean_token_accuracy": 0.7312202453613281, + "num_tokens": 314938480.0, + "step": 12450 + }, + { + "epoch": 1.367340215242697, + "grad_norm": 2.400381326675415, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7243983745574951, + "num_tokens": 314961117.0, + "step": 12451 + }, + { + "epoch": 1.3674500329453108, + "grad_norm": 2.4730584621429443, + "learning_rate": 1e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7248833179473877, + "num_tokens": 314983333.0, + "step": 12452 + }, + { + "epoch": 
1.3675598506479245, + "grad_norm": 2.197700023651123, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7056891918182373, + "num_tokens": 315010943.0, + "step": 12453 + }, + { + "epoch": 1.367669668350538, + "grad_norm": 2.069221019744873, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7246053218841553, + "num_tokens": 315040064.0, + "step": 12454 + }, + { + "epoch": 1.3677794860531518, + "grad_norm": 2.357452630996704, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7162102460861206, + "num_tokens": 315063198.0, + "step": 12455 + }, + { + "epoch": 1.3678893037557653, + "grad_norm": 2.1025454998016357, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.6959929466247559, + "num_tokens": 315095780.0, + "step": 12456 + }, + { + "epoch": 1.367999121458379, + "grad_norm": 2.0452980995178223, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7143727540969849, + "num_tokens": 315126010.0, + "step": 12457 + }, + { + "epoch": 1.3681089391609929, + "grad_norm": 2.0267174243927, + "learning_rate": 1e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7301836013793945, + "num_tokens": 315155714.0, + "step": 12458 + }, + { + "epoch": 1.3682187568636064, + "grad_norm": 2.4145259857177734, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7057521343231201, + "num_tokens": 315179209.0, + "step": 12459 + }, + { + "epoch": 1.3683285745662201, + "grad_norm": 2.374175786972046, + "learning_rate": 1e-06, + "loss": 0.8378, + "mean_token_accuracy": 0.7340755462646484, + "num_tokens": 315203003.0, + "step": 12460 + }, + { + "epoch": 1.3684383922688337, + "grad_norm": 2.166841983795166, + "learning_rate": 1e-06, + "loss": 1.0542, + "mean_token_accuracy": 0.680648922920227, + "num_tokens": 315230930.0, + "step": 12461 + }, + { + "epoch": 1.3685482099714474, + "grad_norm": 2.428821325302124, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 
0.7144409418106079, + "num_tokens": 315254251.0, + "step": 12462 + }, + { + "epoch": 1.3686580276740612, + "grad_norm": 2.1120355129241943, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7243858575820923, + "num_tokens": 315280911.0, + "step": 12463 + }, + { + "epoch": 1.3687678453766747, + "grad_norm": 2.162641763687134, + "learning_rate": 1e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7207353115081787, + "num_tokens": 315307863.0, + "step": 12464 + }, + { + "epoch": 1.3688776630792883, + "grad_norm": 2.330805778503418, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7055824398994446, + "num_tokens": 315332861.0, + "step": 12465 + }, + { + "epoch": 1.368987480781902, + "grad_norm": 2.7128772735595703, + "learning_rate": 1e-06, + "loss": 0.843, + "mean_token_accuracy": 0.7288893461227417, + "num_tokens": 315351578.0, + "step": 12466 + }, + { + "epoch": 1.3690972984845158, + "grad_norm": 2.253324270248413, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.727614164352417, + "num_tokens": 315374768.0, + "step": 12467 + }, + { + "epoch": 1.3692071161871293, + "grad_norm": 2.2799627780914307, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7093057036399841, + "num_tokens": 315401014.0, + "step": 12468 + }, + { + "epoch": 1.369316933889743, + "grad_norm": 2.3565735816955566, + "learning_rate": 1e-06, + "loss": 0.8585, + "mean_token_accuracy": 0.7339684367179871, + "num_tokens": 315424486.0, + "step": 12469 + }, + { + "epoch": 1.3694267515923566, + "grad_norm": 2.0162906646728516, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7156422138214111, + "num_tokens": 315458551.0, + "step": 12470 + }, + { + "epoch": 1.3695365692949704, + "grad_norm": 2.172760009765625, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.714388370513916, + "num_tokens": 315486068.0, + "step": 12471 + }, + { + "epoch": 1.3696463869975841, + "grad_norm": 
2.1373488903045654, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7236706018447876, + "num_tokens": 315511746.0, + "step": 12472 + }, + { + "epoch": 1.3697562047001977, + "grad_norm": 2.1744349002838135, + "learning_rate": 1e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7220228910446167, + "num_tokens": 315538723.0, + "step": 12473 + }, + { + "epoch": 1.3698660224028114, + "grad_norm": 2.33025860786438, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7091907262802124, + "num_tokens": 315563641.0, + "step": 12474 + }, + { + "epoch": 1.369975840105425, + "grad_norm": 2.0804200172424316, + "learning_rate": 1e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7175675630569458, + "num_tokens": 315590476.0, + "step": 12475 + }, + { + "epoch": 1.3700856578080387, + "grad_norm": 2.382828712463379, + "learning_rate": 1e-06, + "loss": 0.877, + "mean_token_accuracy": 0.7294288277626038, + "num_tokens": 315613506.0, + "step": 12476 + }, + { + "epoch": 1.3701954755106522, + "grad_norm": 2.334709405899048, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7179722785949707, + "num_tokens": 315638217.0, + "step": 12477 + }, + { + "epoch": 1.370305293213266, + "grad_norm": 2.065000295639038, + "learning_rate": 1e-06, + "loss": 0.8012, + "mean_token_accuracy": 0.7395318150520325, + "num_tokens": 315667123.0, + "step": 12478 + }, + { + "epoch": 1.3704151109158795, + "grad_norm": 2.2733259201049805, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.717242956161499, + "num_tokens": 315691380.0, + "step": 12479 + }, + { + "epoch": 1.3705249286184933, + "grad_norm": 2.3012542724609375, + "learning_rate": 1e-06, + "loss": 0.8495, + "mean_token_accuracy": 0.7363309264183044, + "num_tokens": 315714517.0, + "step": 12480 + }, + { + "epoch": 1.370634746321107, + "grad_norm": 2.3279471397399902, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7226462364196777, + "num_tokens": 
315737273.0, + "step": 12481 + }, + { + "epoch": 1.3707445640237206, + "grad_norm": 2.186326503753662, + "learning_rate": 1e-06, + "loss": 0.937, + "mean_token_accuracy": 0.7120790481567383, + "num_tokens": 315764536.0, + "step": 12482 + }, + { + "epoch": 1.3708543817263343, + "grad_norm": 2.257514715194702, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7177494764328003, + "num_tokens": 315791790.0, + "step": 12483 + }, + { + "epoch": 1.3709641994289479, + "grad_norm": 2.0869903564453125, + "learning_rate": 1e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.7323903441429138, + "num_tokens": 315820789.0, + "step": 12484 + }, + { + "epoch": 1.3710740171315616, + "grad_norm": 2.342308282852173, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7108104228973389, + "num_tokens": 315844977.0, + "step": 12485 + }, + { + "epoch": 1.3711838348341754, + "grad_norm": 2.221039056777954, + "learning_rate": 1e-06, + "loss": 0.8103, + "mean_token_accuracy": 0.742992103099823, + "num_tokens": 315870139.0, + "step": 12486 + }, + { + "epoch": 1.371293652536789, + "grad_norm": 2.1300203800201416, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7282038331031799, + "num_tokens": 315896996.0, + "step": 12487 + }, + { + "epoch": 1.3714034702394025, + "grad_norm": 2.2120649814605713, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7134220004081726, + "num_tokens": 315924135.0, + "step": 12488 + }, + { + "epoch": 1.3715132879420162, + "grad_norm": 2.27445125579834, + "learning_rate": 1e-06, + "loss": 0.8434, + "mean_token_accuracy": 0.73442542552948, + "num_tokens": 315948389.0, + "step": 12489 + }, + { + "epoch": 1.37162310564463, + "grad_norm": 2.47188401222229, + "learning_rate": 1e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7234777212142944, + "num_tokens": 315970266.0, + "step": 12490 + }, + { + "epoch": 1.3717329233472435, + "grad_norm": 2.3490097522735596, + "learning_rate": 1e-06, + 
"loss": 0.8882, + "mean_token_accuracy": 0.7253568172454834, + "num_tokens": 315993560.0, + "step": 12491 + }, + { + "epoch": 1.3718427410498573, + "grad_norm": 2.2003304958343506, + "learning_rate": 1e-06, + "loss": 0.8373, + "mean_token_accuracy": 0.7399747967720032, + "num_tokens": 316018567.0, + "step": 12492 + }, + { + "epoch": 1.3719525587524708, + "grad_norm": 2.434727668762207, + "learning_rate": 1e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7289738059043884, + "num_tokens": 316041943.0, + "step": 12493 + }, + { + "epoch": 1.3720623764550846, + "grad_norm": 2.431541681289673, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.707816481590271, + "num_tokens": 316065720.0, + "step": 12494 + }, + { + "epoch": 1.3721721941576983, + "grad_norm": 2.4584743976593018, + "learning_rate": 1e-06, + "loss": 0.821, + "mean_token_accuracy": 0.742641806602478, + "num_tokens": 316087661.0, + "step": 12495 + }, + { + "epoch": 1.3722820118603118, + "grad_norm": 2.3259708881378174, + "learning_rate": 1e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7189769148826599, + "num_tokens": 316111397.0, + "step": 12496 + }, + { + "epoch": 1.3723918295629256, + "grad_norm": 2.0885379314422607, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.714367687702179, + "num_tokens": 316141173.0, + "step": 12497 + }, + { + "epoch": 1.3725016472655391, + "grad_norm": 2.1160528659820557, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.727413535118103, + "num_tokens": 316166949.0, + "step": 12498 + }, + { + "epoch": 1.372611464968153, + "grad_norm": 2.318901777267456, + "learning_rate": 1e-06, + "loss": 0.7877, + "mean_token_accuracy": 0.7466932535171509, + "num_tokens": 316189715.0, + "step": 12499 + }, + { + "epoch": 1.3727212826707667, + "grad_norm": 2.1936724185943604, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7193650007247925, + "num_tokens": 316218860.0, + "step": 12500 + }, + { + "epoch": 
1.3728311003733802, + "grad_norm": 2.7613437175750732, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7202908992767334, + "num_tokens": 316236766.0, + "step": 12501 + }, + { + "epoch": 1.3729409180759937, + "grad_norm": 2.448305368423462, + "learning_rate": 1e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7189227342605591, + "num_tokens": 316257477.0, + "step": 12502 + }, + { + "epoch": 1.3730507357786075, + "grad_norm": 2.3982086181640625, + "learning_rate": 1e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7211270928382874, + "num_tokens": 316279513.0, + "step": 12503 + }, + { + "epoch": 1.3731605534812212, + "grad_norm": 2.0790109634399414, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7113544940948486, + "num_tokens": 316309753.0, + "step": 12504 + }, + { + "epoch": 1.3732703711838348, + "grad_norm": 2.225391387939453, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.708228349685669, + "num_tokens": 316337168.0, + "step": 12505 + }, + { + "epoch": 1.3733801888864485, + "grad_norm": 1.9835361242294312, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7229299545288086, + "num_tokens": 316368745.0, + "step": 12506 + }, + { + "epoch": 1.373490006589062, + "grad_norm": 2.382052183151245, + "learning_rate": 1e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7186034917831421, + "num_tokens": 316391444.0, + "step": 12507 + }, + { + "epoch": 1.3735998242916758, + "grad_norm": 2.106785774230957, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7139354944229126, + "num_tokens": 316418305.0, + "step": 12508 + }, + { + "epoch": 1.3737096419942896, + "grad_norm": 2.164956569671631, + "learning_rate": 1e-06, + "loss": 0.8385, + "mean_token_accuracy": 0.7417899370193481, + "num_tokens": 316444485.0, + "step": 12509 + }, + { + "epoch": 1.3738194596969031, + "grad_norm": 2.4012537002563477, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 
0.7117096781730652, + "num_tokens": 316467916.0, + "step": 12510 + }, + { + "epoch": 1.3739292773995169, + "grad_norm": 2.4287381172180176, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7209497094154358, + "num_tokens": 316489883.0, + "step": 12511 + }, + { + "epoch": 1.3740390951021304, + "grad_norm": 2.0822315216064453, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.7046195268630981, + "num_tokens": 316519635.0, + "step": 12512 + }, + { + "epoch": 1.3741489128047442, + "grad_norm": 2.264054775238037, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.6838347315788269, + "num_tokens": 316546568.0, + "step": 12513 + }, + { + "epoch": 1.374258730507358, + "grad_norm": 2.087440013885498, + "learning_rate": 1e-06, + "loss": 1.047, + "mean_token_accuracy": 0.6833181381225586, + "num_tokens": 316577985.0, + "step": 12514 + }, + { + "epoch": 1.3743685482099715, + "grad_norm": 2.3029580116271973, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.720676064491272, + "num_tokens": 316600536.0, + "step": 12515 + }, + { + "epoch": 1.374478365912585, + "grad_norm": 2.2998690605163574, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7252713441848755, + "num_tokens": 316625566.0, + "step": 12516 + }, + { + "epoch": 1.3745881836151987, + "grad_norm": 2.103604555130005, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7005869150161743, + "num_tokens": 316655121.0, + "step": 12517 + }, + { + "epoch": 1.3746980013178125, + "grad_norm": 2.5119452476501465, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7267130017280579, + "num_tokens": 316676346.0, + "step": 12518 + }, + { + "epoch": 1.374807819020426, + "grad_norm": 2.1499805450439453, + "learning_rate": 1e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7238362431526184, + "num_tokens": 316705584.0, + "step": 12519 + }, + { + "epoch": 1.3749176367230398, + "grad_norm": 
2.0107951164245605, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7188006043434143, + "num_tokens": 316735897.0, + "step": 12520 + }, + { + "epoch": 1.3750274544256533, + "grad_norm": 2.3235790729522705, + "learning_rate": 1e-06, + "loss": 0.8334, + "mean_token_accuracy": 0.7345033884048462, + "num_tokens": 316758648.0, + "step": 12521 + }, + { + "epoch": 1.375137272128267, + "grad_norm": 2.304286241531372, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7256026268005371, + "num_tokens": 316781225.0, + "step": 12522 + }, + { + "epoch": 1.3752470898308808, + "grad_norm": 2.186415195465088, + "learning_rate": 1e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.7027453184127808, + "num_tokens": 316806862.0, + "step": 12523 + }, + { + "epoch": 1.3753569075334944, + "grad_norm": 2.463103771209717, + "learning_rate": 1e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7227013111114502, + "num_tokens": 316829840.0, + "step": 12524 + }, + { + "epoch": 1.3754667252361081, + "grad_norm": 2.215989589691162, + "learning_rate": 1e-06, + "loss": 0.8041, + "mean_token_accuracy": 0.7451215386390686, + "num_tokens": 316855076.0, + "step": 12525 + }, + { + "epoch": 1.3755765429387217, + "grad_norm": 2.2605886459350586, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7042381763458252, + "num_tokens": 316880543.0, + "step": 12526 + }, + { + "epoch": 1.3756863606413354, + "grad_norm": 1.9845446348190308, + "learning_rate": 1e-06, + "loss": 0.7833, + "mean_token_accuracy": 0.7538206577301025, + "num_tokens": 316909485.0, + "step": 12527 + }, + { + "epoch": 1.3757961783439492, + "grad_norm": 2.1278817653656006, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7091050148010254, + "num_tokens": 316937893.0, + "step": 12528 + }, + { + "epoch": 1.3759059960465627, + "grad_norm": 2.447993040084839, + "learning_rate": 1e-06, + "loss": 0.8732, + "mean_token_accuracy": 0.72266685962677, + "num_tokens": 
316961674.0, + "step": 12529 + }, + { + "epoch": 1.3760158137491763, + "grad_norm": 2.3455965518951416, + "learning_rate": 1e-06, + "loss": 0.8493, + "mean_token_accuracy": 0.7368713617324829, + "num_tokens": 316984160.0, + "step": 12530 + }, + { + "epoch": 1.37612563145179, + "grad_norm": 2.3090660572052, + "learning_rate": 1e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7287981510162354, + "num_tokens": 317009795.0, + "step": 12531 + }, + { + "epoch": 1.3762354491544038, + "grad_norm": 2.3332319259643555, + "learning_rate": 1e-06, + "loss": 0.814, + "mean_token_accuracy": 0.7424737215042114, + "num_tokens": 317032959.0, + "step": 12532 + }, + { + "epoch": 1.3763452668570173, + "grad_norm": 2.0691545009613037, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7018254995346069, + "num_tokens": 317064127.0, + "step": 12533 + }, + { + "epoch": 1.376455084559631, + "grad_norm": 2.0300302505493164, + "learning_rate": 1e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.7302138209342957, + "num_tokens": 317092764.0, + "step": 12534 + }, + { + "epoch": 1.3765649022622446, + "grad_norm": 2.3062729835510254, + "learning_rate": 1e-06, + "loss": 0.8602, + "mean_token_accuracy": 0.7353741526603699, + "num_tokens": 317115581.0, + "step": 12535 + }, + { + "epoch": 1.3766747199648584, + "grad_norm": 2.3173861503601074, + "learning_rate": 1e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7289806604385376, + "num_tokens": 317139538.0, + "step": 12536 + }, + { + "epoch": 1.376784537667472, + "grad_norm": 2.240102767944336, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7106900215148926, + "num_tokens": 317165736.0, + "step": 12537 + }, + { + "epoch": 1.3768943553700856, + "grad_norm": 2.0799245834350586, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7096549272537231, + "num_tokens": 317194017.0, + "step": 12538 + }, + { + "epoch": 1.3770041730726994, + "grad_norm": 2.199892520904541, + "learning_rate": 
1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7046186923980713, + "num_tokens": 317221054.0, + "step": 12539 + }, + { + "epoch": 1.377113990775313, + "grad_norm": 2.206373929977417, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7052810788154602, + "num_tokens": 317249337.0, + "step": 12540 + }, + { + "epoch": 1.3772238084779267, + "grad_norm": 2.3803155422210693, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7414956092834473, + "num_tokens": 317272912.0, + "step": 12541 + }, + { + "epoch": 1.3773336261805402, + "grad_norm": 2.1818430423736572, + "learning_rate": 1e-06, + "loss": 0.7438, + "mean_token_accuracy": 0.7663591504096985, + "num_tokens": 317296640.0, + "step": 12542 + }, + { + "epoch": 1.377443443883154, + "grad_norm": 2.208655834197998, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7187058925628662, + "num_tokens": 317324676.0, + "step": 12543 + }, + { + "epoch": 1.3775532615857675, + "grad_norm": 2.4283676147460938, + "learning_rate": 1e-06, + "loss": 0.8413, + "mean_token_accuracy": 0.7375655174255371, + "num_tokens": 317346168.0, + "step": 12544 + }, + { + "epoch": 1.3776630792883813, + "grad_norm": 2.439418315887451, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7064493298530579, + "num_tokens": 317368312.0, + "step": 12545 + }, + { + "epoch": 1.377772896990995, + "grad_norm": 2.152007579803467, + "learning_rate": 1e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.7265782356262207, + "num_tokens": 317395316.0, + "step": 12546 + }, + { + "epoch": 1.3778827146936086, + "grad_norm": 2.041386842727661, + "learning_rate": 1e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.697766900062561, + "num_tokens": 317426921.0, + "step": 12547 + }, + { + "epoch": 1.3779925323962223, + "grad_norm": 2.30918550491333, + "learning_rate": 1e-06, + "loss": 0.7921, + "mean_token_accuracy": 0.7483816146850586, + "num_tokens": 317449902.0, + "step": 12548 + }, + { + 
"epoch": 1.3781023500988359, + "grad_norm": 2.418057441711426, + "learning_rate": 1e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7233035564422607, + "num_tokens": 317470343.0, + "step": 12549 + }, + { + "epoch": 1.3782121678014496, + "grad_norm": 2.2820229530334473, + "learning_rate": 1e-06, + "loss": 0.8671, + "mean_token_accuracy": 0.7320888042449951, + "num_tokens": 317494044.0, + "step": 12550 + }, + { + "epoch": 1.3783219855040634, + "grad_norm": 2.4074811935424805, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7071269750595093, + "num_tokens": 317519398.0, + "step": 12551 + }, + { + "epoch": 1.378431803206677, + "grad_norm": 2.475874423980713, + "learning_rate": 1e-06, + "loss": 0.7929, + "mean_token_accuracy": 0.7520847320556641, + "num_tokens": 317541683.0, + "step": 12552 + }, + { + "epoch": 1.3785416209092904, + "grad_norm": 2.4700028896331787, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7115136981010437, + "num_tokens": 317566132.0, + "step": 12553 + }, + { + "epoch": 1.3786514386119042, + "grad_norm": 2.2944576740264893, + "learning_rate": 1e-06, + "loss": 0.8229, + "mean_token_accuracy": 0.7519473433494568, + "num_tokens": 317589039.0, + "step": 12554 + }, + { + "epoch": 1.378761256314518, + "grad_norm": 2.8625943660736084, + "learning_rate": 1e-06, + "loss": 0.8323, + "mean_token_accuracy": 0.7371115684509277, + "num_tokens": 317606760.0, + "step": 12555 + }, + { + "epoch": 1.3788710740171315, + "grad_norm": 2.478107452392578, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7172161936759949, + "num_tokens": 317629776.0, + "step": 12556 + }, + { + "epoch": 1.3789808917197452, + "grad_norm": 2.2890465259552, + "learning_rate": 1e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7327528595924377, + "num_tokens": 317654031.0, + "step": 12557 + }, + { + "epoch": 1.3790907094223588, + "grad_norm": 2.3891115188598633, + "learning_rate": 1e-06, + "loss": 0.9222, + 
"mean_token_accuracy": 0.709349513053894, + "num_tokens": 317678280.0, + "step": 12558 + }, + { + "epoch": 1.3792005271249725, + "grad_norm": 2.302490711212158, + "learning_rate": 1e-06, + "loss": 0.7846, + "mean_token_accuracy": 0.7464348077774048, + "num_tokens": 317702165.0, + "step": 12559 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 2.353797435760498, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7237457036972046, + "num_tokens": 317726968.0, + "step": 12560 + }, + { + "epoch": 1.3794201625301998, + "grad_norm": 2.4751245975494385, + "learning_rate": 1e-06, + "loss": 0.8489, + "mean_token_accuracy": 0.732943058013916, + "num_tokens": 317748191.0, + "step": 12561 + }, + { + "epoch": 1.3795299802328136, + "grad_norm": 2.282107353210449, + "learning_rate": 1e-06, + "loss": 0.888, + "mean_token_accuracy": 0.7245851159095764, + "num_tokens": 317772625.0, + "step": 12562 + }, + { + "epoch": 1.3796397979354271, + "grad_norm": 2.1702682971954346, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.6998591423034668, + "num_tokens": 317802225.0, + "step": 12563 + }, + { + "epoch": 1.3797496156380409, + "grad_norm": 2.097961902618408, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7008614540100098, + "num_tokens": 317831944.0, + "step": 12564 + }, + { + "epoch": 1.3798594333406546, + "grad_norm": 2.1556522846221924, + "learning_rate": 1e-06, + "loss": 0.7656, + "mean_token_accuracy": 0.7557821273803711, + "num_tokens": 317858314.0, + "step": 12565 + }, + { + "epoch": 1.3799692510432682, + "grad_norm": 2.1181907653808594, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7097270488739014, + "num_tokens": 317887234.0, + "step": 12566 + }, + { + "epoch": 1.3800790687458817, + "grad_norm": 2.4931366443634033, + "learning_rate": 1e-06, + "loss": 0.8427, + "mean_token_accuracy": 0.7322165966033936, + "num_tokens": 317907676.0, + "step": 12567 + }, + { + "epoch": 
1.3801888864484955, + "grad_norm": 2.1764116287231445, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7023371458053589, + "num_tokens": 317935682.0, + "step": 12568 + }, + { + "epoch": 1.3802987041511092, + "grad_norm": 2.230748176574707, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.705815315246582, + "num_tokens": 317960930.0, + "step": 12569 + }, + { + "epoch": 1.3804085218537228, + "grad_norm": 1.9455784559249878, + "learning_rate": 1e-06, + "loss": 0.849, + "mean_token_accuracy": 0.7392163276672363, + "num_tokens": 317993634.0, + "step": 12570 + }, + { + "epoch": 1.3805183395563365, + "grad_norm": 2.2880842685699463, + "learning_rate": 1e-06, + "loss": 0.8412, + "mean_token_accuracy": 0.7355263829231262, + "num_tokens": 318017232.0, + "step": 12571 + }, + { + "epoch": 1.38062815725895, + "grad_norm": 2.0885345935821533, + "learning_rate": 1e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.7270021438598633, + "num_tokens": 318045937.0, + "step": 12572 + }, + { + "epoch": 1.3807379749615638, + "grad_norm": 2.1680221557617188, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.718449592590332, + "num_tokens": 318074030.0, + "step": 12573 + }, + { + "epoch": 1.3808477926641776, + "grad_norm": 2.3033339977264404, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7174816727638245, + "num_tokens": 318100031.0, + "step": 12574 + }, + { + "epoch": 1.380957610366791, + "grad_norm": 2.356441020965576, + "learning_rate": 1e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7245137691497803, + "num_tokens": 318122441.0, + "step": 12575 + }, + { + "epoch": 1.3810674280694049, + "grad_norm": 2.263416290283203, + "learning_rate": 1e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.7225578427314758, + "num_tokens": 318148486.0, + "step": 12576 + }, + { + "epoch": 1.3811772457720184, + "grad_norm": 2.1632513999938965, + "learning_rate": 1e-06, + "loss": 0.885, + "mean_token_accuracy": 
0.7315557599067688, + "num_tokens": 318176483.0, + "step": 12577 + }, + { + "epoch": 1.3812870634746321, + "grad_norm": 2.3109123706817627, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7169163227081299, + "num_tokens": 318202035.0, + "step": 12578 + }, + { + "epoch": 1.381396881177246, + "grad_norm": 2.2998788356781006, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7096115350723267, + "num_tokens": 318226834.0, + "step": 12579 + }, + { + "epoch": 1.3815066988798594, + "grad_norm": 2.1650185585021973, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.7062180638313293, + "num_tokens": 318256145.0, + "step": 12580 + }, + { + "epoch": 1.381616516582473, + "grad_norm": 2.5315663814544678, + "learning_rate": 1e-06, + "loss": 0.83, + "mean_token_accuracy": 0.740955114364624, + "num_tokens": 318277499.0, + "step": 12581 + }, + { + "epoch": 1.3817263342850867, + "grad_norm": 2.1396427154541016, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7005248069763184, + "num_tokens": 318308371.0, + "step": 12582 + }, + { + "epoch": 1.3818361519877005, + "grad_norm": 2.138533592224121, + "learning_rate": 1e-06, + "loss": 0.8561, + "mean_token_accuracy": 0.7356827259063721, + "num_tokens": 318337216.0, + "step": 12583 + }, + { + "epoch": 1.381945969690314, + "grad_norm": 2.097233295440674, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7176486253738403, + "num_tokens": 318365367.0, + "step": 12584 + }, + { + "epoch": 1.3820557873929278, + "grad_norm": 2.422516345977783, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7183542251586914, + "num_tokens": 318387710.0, + "step": 12585 + }, + { + "epoch": 1.3821656050955413, + "grad_norm": 2.3051698207855225, + "learning_rate": 1e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.7279603481292725, + "num_tokens": 318412368.0, + "step": 12586 + }, + { + "epoch": 1.382275422798155, + "grad_norm": 
2.196103811264038, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7069076299667358, + "num_tokens": 318440650.0, + "step": 12587 + }, + { + "epoch": 1.3823852405007688, + "grad_norm": 2.486128330230713, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7117633819580078, + "num_tokens": 318464733.0, + "step": 12588 + }, + { + "epoch": 1.3824950582033824, + "grad_norm": 2.3578855991363525, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7433352470397949, + "num_tokens": 318487355.0, + "step": 12589 + }, + { + "epoch": 1.3826048759059961, + "grad_norm": 2.0798580646514893, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7212885618209839, + "num_tokens": 318515111.0, + "step": 12590 + }, + { + "epoch": 1.3827146936086097, + "grad_norm": 2.465275764465332, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7062869071960449, + "num_tokens": 318539719.0, + "step": 12591 + }, + { + "epoch": 1.3828245113112234, + "grad_norm": 1.8715852499008179, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7252652645111084, + "num_tokens": 318569628.0, + "step": 12592 + }, + { + "epoch": 1.382934329013837, + "grad_norm": 2.223388910293579, + "learning_rate": 1e-06, + "loss": 0.7925, + "mean_token_accuracy": 0.7445180416107178, + "num_tokens": 318593491.0, + "step": 12593 + }, + { + "epoch": 1.3830441467164507, + "grad_norm": 2.6685643196105957, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7184156179428101, + "num_tokens": 318613356.0, + "step": 12594 + }, + { + "epoch": 1.3831539644190642, + "grad_norm": 2.6747918128967285, + "learning_rate": 1e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7223222255706787, + "num_tokens": 318633494.0, + "step": 12595 + }, + { + "epoch": 1.383263782121678, + "grad_norm": 2.5924594402313232, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7350491285324097, + "num_tokens": 
318652352.0, + "step": 12596 + }, + { + "epoch": 1.3833735998242918, + "grad_norm": 2.1998913288116455, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7035103440284729, + "num_tokens": 318681812.0, + "step": 12597 + }, + { + "epoch": 1.3834834175269053, + "grad_norm": 2.2211434841156006, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7238272428512573, + "num_tokens": 318706876.0, + "step": 12598 + }, + { + "epoch": 1.383593235229519, + "grad_norm": 2.2988321781158447, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7099347114562988, + "num_tokens": 318730329.0, + "step": 12599 + }, + { + "epoch": 1.3837030529321326, + "grad_norm": 2.373880386352539, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7198863625526428, + "num_tokens": 318753849.0, + "step": 12600 + }, + { + "epoch": 1.3838128706347463, + "grad_norm": 2.0233054161071777, + "learning_rate": 1e-06, + "loss": 0.8493, + "mean_token_accuracy": 0.7348819971084595, + "num_tokens": 318784333.0, + "step": 12601 + }, + { + "epoch": 1.38392268833736, + "grad_norm": 2.4223456382751465, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.6986111402511597, + "num_tokens": 318807158.0, + "step": 12602 + }, + { + "epoch": 1.3840325060399736, + "grad_norm": 2.191622734069824, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7018647193908691, + "num_tokens": 318833652.0, + "step": 12603 + }, + { + "epoch": 1.3841423237425874, + "grad_norm": 2.437246561050415, + "learning_rate": 1e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.7297316789627075, + "num_tokens": 318860231.0, + "step": 12604 + }, + { + "epoch": 1.384252141445201, + "grad_norm": 2.2732129096984863, + "learning_rate": 1e-06, + "loss": 0.7846, + "mean_token_accuracy": 0.7556246519088745, + "num_tokens": 318883911.0, + "step": 12605 + }, + { + "epoch": 1.3843619591478147, + "grad_norm": 2.4893651008605957, + "learning_rate": 
1e-06, + "loss": 0.799, + "mean_token_accuracy": 0.7514361143112183, + "num_tokens": 318903362.0, + "step": 12606 + }, + { + "epoch": 1.3844717768504282, + "grad_norm": 2.056999683380127, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7227532863616943, + "num_tokens": 318932870.0, + "step": 12607 + }, + { + "epoch": 1.384581594553042, + "grad_norm": 2.2836432456970215, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.6944910287857056, + "num_tokens": 318959591.0, + "step": 12608 + }, + { + "epoch": 1.3846914122556555, + "grad_norm": 2.000195026397705, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7067276835441589, + "num_tokens": 318991897.0, + "step": 12609 + }, + { + "epoch": 1.3848012299582693, + "grad_norm": 2.1092512607574463, + "learning_rate": 1e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.6858844757080078, + "num_tokens": 319021568.0, + "step": 12610 + }, + { + "epoch": 1.384911047660883, + "grad_norm": 2.002824306488037, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7165660858154297, + "num_tokens": 319050641.0, + "step": 12611 + }, + { + "epoch": 1.3850208653634966, + "grad_norm": 2.5694961547851562, + "learning_rate": 1e-06, + "loss": 0.8089, + "mean_token_accuracy": 0.7450388669967651, + "num_tokens": 319070056.0, + "step": 12612 + }, + { + "epoch": 1.3851306830661103, + "grad_norm": 2.1981985569000244, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.706179141998291, + "num_tokens": 319099200.0, + "step": 12613 + }, + { + "epoch": 1.3852405007687238, + "grad_norm": 2.3701882362365723, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7180293798446655, + "num_tokens": 319122737.0, + "step": 12614 + }, + { + "epoch": 1.3853503184713376, + "grad_norm": 2.0539612770080566, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.6978973150253296, + "num_tokens": 319154739.0, + "step": 12615 + }, + { + 
"epoch": 1.3854601361739514, + "grad_norm": 2.275824785232544, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7157007455825806, + "num_tokens": 319180347.0, + "step": 12616 + }, + { + "epoch": 1.385569953876565, + "grad_norm": 2.512834072113037, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7023119926452637, + "num_tokens": 319203972.0, + "step": 12617 + }, + { + "epoch": 1.3856797715791784, + "grad_norm": 2.143712282180786, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7092507481575012, + "num_tokens": 319232512.0, + "step": 12618 + }, + { + "epoch": 1.3857895892817922, + "grad_norm": 2.3726093769073486, + "learning_rate": 1e-06, + "loss": 0.8578, + "mean_token_accuracy": 0.7252010107040405, + "num_tokens": 319255151.0, + "step": 12619 + }, + { + "epoch": 1.385899406984406, + "grad_norm": 2.220705032348633, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7194463014602661, + "num_tokens": 319280721.0, + "step": 12620 + }, + { + "epoch": 1.3860092246870195, + "grad_norm": 1.9314560890197754, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7072519063949585, + "num_tokens": 319313367.0, + "step": 12621 + }, + { + "epoch": 1.3861190423896332, + "grad_norm": 2.4159140586853027, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7079334259033203, + "num_tokens": 319337161.0, + "step": 12622 + }, + { + "epoch": 1.3862288600922468, + "grad_norm": 2.212925672531128, + "learning_rate": 1e-06, + "loss": 0.849, + "mean_token_accuracy": 0.7298147082328796, + "num_tokens": 319364271.0, + "step": 12623 + }, + { + "epoch": 1.3863386777948605, + "grad_norm": 2.317488193511963, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7126920223236084, + "num_tokens": 319389469.0, + "step": 12624 + }, + { + "epoch": 1.3864484954974743, + "grad_norm": 2.1744580268859863, + "learning_rate": 1e-06, + "loss": 0.9373, + 
"mean_token_accuracy": 0.7121907472610474, + "num_tokens": 319417056.0, + "step": 12625 + }, + { + "epoch": 1.3865583132000878, + "grad_norm": 2.4193172454833984, + "learning_rate": 1e-06, + "loss": 0.8397, + "mean_token_accuracy": 0.7365995645523071, + "num_tokens": 319437385.0, + "step": 12626 + }, + { + "epoch": 1.3866681309027016, + "grad_norm": 2.1977310180664062, + "learning_rate": 1e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.6972192525863647, + "num_tokens": 319467449.0, + "step": 12627 + }, + { + "epoch": 1.3867779486053151, + "grad_norm": 2.5496842861175537, + "learning_rate": 1e-06, + "loss": 0.8045, + "mean_token_accuracy": 0.7455233335494995, + "num_tokens": 319488538.0, + "step": 12628 + }, + { + "epoch": 1.3868877663079289, + "grad_norm": 2.3377649784088135, + "learning_rate": 1e-06, + "loss": 0.7913, + "mean_token_accuracy": 0.7436755299568176, + "num_tokens": 319511035.0, + "step": 12629 + }, + { + "epoch": 1.3869975840105426, + "grad_norm": 2.092202663421631, + "learning_rate": 1e-06, + "loss": 0.845, + "mean_token_accuracy": 0.7319869995117188, + "num_tokens": 319537431.0, + "step": 12630 + }, + { + "epoch": 1.3871074017131562, + "grad_norm": 2.2528467178344727, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7304009199142456, + "num_tokens": 319562806.0, + "step": 12631 + }, + { + "epoch": 1.3872172194157697, + "grad_norm": 2.0533790588378906, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.717751145362854, + "num_tokens": 319592034.0, + "step": 12632 + }, + { + "epoch": 1.3873270371183835, + "grad_norm": 2.0985937118530273, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7001118659973145, + "num_tokens": 319623316.0, + "step": 12633 + }, + { + "epoch": 1.3874368548209972, + "grad_norm": 2.2664358615875244, + "learning_rate": 1e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.723178505897522, + "num_tokens": 319647791.0, + "step": 12634 + }, + { + "epoch": 
1.3875466725236107, + "grad_norm": 2.445789098739624, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7119125127792358, + "num_tokens": 319670503.0, + "step": 12635 + }, + { + "epoch": 1.3876564902262245, + "grad_norm": 2.262484073638916, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.6803255081176758, + "num_tokens": 319697957.0, + "step": 12636 + }, + { + "epoch": 1.387766307928838, + "grad_norm": 2.6370129585266113, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7205013036727905, + "num_tokens": 319718695.0, + "step": 12637 + }, + { + "epoch": 1.3878761256314518, + "grad_norm": 2.175093412399292, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7117119431495667, + "num_tokens": 319748456.0, + "step": 12638 + }, + { + "epoch": 1.3879859433340656, + "grad_norm": 2.228123664855957, + "learning_rate": 1e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.7278643250465393, + "num_tokens": 319772415.0, + "step": 12639 + }, + { + "epoch": 1.388095761036679, + "grad_norm": 2.1564152240753174, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7163269519805908, + "num_tokens": 319800688.0, + "step": 12640 + }, + { + "epoch": 1.3882055787392928, + "grad_norm": 2.6450631618499756, + "learning_rate": 1e-06, + "loss": 0.8206, + "mean_token_accuracy": 0.7414742708206177, + "num_tokens": 319819317.0, + "step": 12641 + }, + { + "epoch": 1.3883153964419064, + "grad_norm": 2.420718193054199, + "learning_rate": 1e-06, + "loss": 0.8299, + "mean_token_accuracy": 0.7428998351097107, + "num_tokens": 319841273.0, + "step": 12642 + }, + { + "epoch": 1.3884252141445201, + "grad_norm": 2.348839044570923, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7198963761329651, + "num_tokens": 319864538.0, + "step": 12643 + }, + { + "epoch": 1.388535031847134, + "grad_norm": 2.668449640274048, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 
0.7253333330154419, + "num_tokens": 319884888.0, + "step": 12644 + }, + { + "epoch": 1.3886448495497474, + "grad_norm": 2.1874091625213623, + "learning_rate": 1e-06, + "loss": 0.8295, + "mean_token_accuracy": 0.7371605634689331, + "num_tokens": 319912940.0, + "step": 12645 + }, + { + "epoch": 1.388754667252361, + "grad_norm": 2.268094062805176, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7135207056999207, + "num_tokens": 319939177.0, + "step": 12646 + }, + { + "epoch": 1.3888644849549747, + "grad_norm": 2.4085094928741455, + "learning_rate": 1e-06, + "loss": 0.871, + "mean_token_accuracy": 0.722723126411438, + "num_tokens": 319961274.0, + "step": 12647 + }, + { + "epoch": 1.3889743026575885, + "grad_norm": 2.3403162956237793, + "learning_rate": 1e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.7238527536392212, + "num_tokens": 319985277.0, + "step": 12648 + }, + { + "epoch": 1.389084120360202, + "grad_norm": 2.674773693084717, + "learning_rate": 1e-06, + "loss": 0.806, + "mean_token_accuracy": 0.7445268034934998, + "num_tokens": 320004848.0, + "step": 12649 + }, + { + "epoch": 1.3891939380628158, + "grad_norm": 2.3176450729370117, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.696100115776062, + "num_tokens": 320030824.0, + "step": 12650 + }, + { + "epoch": 1.3893037557654293, + "grad_norm": 2.081005573272705, + "learning_rate": 1e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7277967929840088, + "num_tokens": 320059999.0, + "step": 12651 + }, + { + "epoch": 1.389413573468043, + "grad_norm": 2.2495381832122803, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7171918153762817, + "num_tokens": 320088228.0, + "step": 12652 + }, + { + "epoch": 1.3895233911706568, + "grad_norm": 2.059680700302124, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7058037519454956, + "num_tokens": 320116970.0, + "step": 12653 + }, + { + "epoch": 1.3896332088732704, + "grad_norm": 
2.3238272666931152, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7159367799758911, + "num_tokens": 320142729.0, + "step": 12654 + }, + { + "epoch": 1.389743026575884, + "grad_norm": 1.9993559122085571, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7052972316741943, + "num_tokens": 320174910.0, + "step": 12655 + }, + { + "epoch": 1.3898528442784976, + "grad_norm": 2.648695468902588, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7182193398475647, + "num_tokens": 320194880.0, + "step": 12656 + }, + { + "epoch": 1.3899626619811114, + "grad_norm": 2.0731663703918457, + "learning_rate": 1e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7245138883590698, + "num_tokens": 320221998.0, + "step": 12657 + }, + { + "epoch": 1.390072479683725, + "grad_norm": 2.1053504943847656, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7083698511123657, + "num_tokens": 320253027.0, + "step": 12658 + }, + { + "epoch": 1.3901822973863387, + "grad_norm": 2.3279924392700195, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7258753776550293, + "num_tokens": 320275586.0, + "step": 12659 + }, + { + "epoch": 1.3902921150889522, + "grad_norm": 2.36208438873291, + "learning_rate": 1e-06, + "loss": 0.7981, + "mean_token_accuracy": 0.746096134185791, + "num_tokens": 320298684.0, + "step": 12660 + }, + { + "epoch": 1.390401932791566, + "grad_norm": 2.454592704772949, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7324516177177429, + "num_tokens": 320322093.0, + "step": 12661 + }, + { + "epoch": 1.3905117504941797, + "grad_norm": 2.2576494216918945, + "learning_rate": 1e-06, + "loss": 0.8094, + "mean_token_accuracy": 0.7429469227790833, + "num_tokens": 320345110.0, + "step": 12662 + }, + { + "epoch": 1.3906215681967933, + "grad_norm": 2.315199136734009, + "learning_rate": 1e-06, + "loss": 0.8361, + "mean_token_accuracy": 0.7340055108070374, + "num_tokens": 
320369324.0, + "step": 12663 + }, + { + "epoch": 1.390731385899407, + "grad_norm": 2.0965147018432617, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7090216279029846, + "num_tokens": 320398978.0, + "step": 12664 + }, + { + "epoch": 1.3908412036020206, + "grad_norm": 2.4949655532836914, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7154898047447205, + "num_tokens": 320421272.0, + "step": 12665 + }, + { + "epoch": 1.3909510213046343, + "grad_norm": 2.316316604614258, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.730057954788208, + "num_tokens": 320444382.0, + "step": 12666 + }, + { + "epoch": 1.391060839007248, + "grad_norm": 2.1606485843658447, + "learning_rate": 1e-06, + "loss": 0.8629, + "mean_token_accuracy": 0.7273766994476318, + "num_tokens": 320472314.0, + "step": 12667 + }, + { + "epoch": 1.3911706567098616, + "grad_norm": 2.161311626434326, + "learning_rate": 1e-06, + "loss": 0.789, + "mean_token_accuracy": 0.7411059141159058, + "num_tokens": 320497670.0, + "step": 12668 + }, + { + "epoch": 1.3912804744124752, + "grad_norm": 2.288573980331421, + "learning_rate": 1e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.698143482208252, + "num_tokens": 320522256.0, + "step": 12669 + }, + { + "epoch": 1.391390292115089, + "grad_norm": 1.878759503364563, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7023472785949707, + "num_tokens": 320555030.0, + "step": 12670 + }, + { + "epoch": 1.3915001098177027, + "grad_norm": 2.165961503982544, + "learning_rate": 1e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.7287325859069824, + "num_tokens": 320581888.0, + "step": 12671 + }, + { + "epoch": 1.3916099275203162, + "grad_norm": 2.4351797103881836, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7216020822525024, + "num_tokens": 320606403.0, + "step": 12672 + }, + { + "epoch": 1.39171974522293, + "grad_norm": 2.0938971042633057, + "learning_rate": 1e-06, + 
"loss": 0.9345, + "mean_token_accuracy": 0.7137824296951294, + "num_tokens": 320632996.0, + "step": 12673 + }, + { + "epoch": 1.3918295629255435, + "grad_norm": 2.398311138153076, + "learning_rate": 1e-06, + "loss": 0.7829, + "mean_token_accuracy": 0.7434283494949341, + "num_tokens": 320653270.0, + "step": 12674 + }, + { + "epoch": 1.3919393806281573, + "grad_norm": 2.181342363357544, + "learning_rate": 1e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7342901229858398, + "num_tokens": 320678320.0, + "step": 12675 + }, + { + "epoch": 1.392049198330771, + "grad_norm": 2.170768976211548, + "learning_rate": 1e-06, + "loss": 0.811, + "mean_token_accuracy": 0.7380335330963135, + "num_tokens": 320704290.0, + "step": 12676 + }, + { + "epoch": 1.3921590160333845, + "grad_norm": 2.283290386199951, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.723647952079773, + "num_tokens": 320729566.0, + "step": 12677 + }, + { + "epoch": 1.3922688337359983, + "grad_norm": 2.502955198287964, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7216065526008606, + "num_tokens": 320750238.0, + "step": 12678 + }, + { + "epoch": 1.3923786514386118, + "grad_norm": 2.3787600994110107, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7133923768997192, + "num_tokens": 320773570.0, + "step": 12679 + }, + { + "epoch": 1.3924884691412256, + "grad_norm": 2.259446382522583, + "learning_rate": 1e-06, + "loss": 0.82, + "mean_token_accuracy": 0.7434926629066467, + "num_tokens": 320797143.0, + "step": 12680 + }, + { + "epoch": 1.3925982868438394, + "grad_norm": 2.23980975151062, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7311171293258667, + "num_tokens": 320823012.0, + "step": 12681 + }, + { + "epoch": 1.3927081045464529, + "grad_norm": 2.3320891857147217, + "learning_rate": 1e-06, + "loss": 0.8294, + "mean_token_accuracy": 0.7387999296188354, + "num_tokens": 320846534.0, + "step": 12682 + }, + { + "epoch": 
1.3928179222490664, + "grad_norm": 2.427913188934326, + "learning_rate": 1e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7362747192382812, + "num_tokens": 320867345.0, + "step": 12683 + }, + { + "epoch": 1.3929277399516802, + "grad_norm": 2.275663137435913, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7265245318412781, + "num_tokens": 320891289.0, + "step": 12684 + }, + { + "epoch": 1.393037557654294, + "grad_norm": 2.298264265060425, + "learning_rate": 1e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.7535644173622131, + "num_tokens": 320913274.0, + "step": 12685 + }, + { + "epoch": 1.3931473753569075, + "grad_norm": 2.1049065589904785, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7068648338317871, + "num_tokens": 320944073.0, + "step": 12686 + }, + { + "epoch": 1.3932571930595212, + "grad_norm": 2.431910753250122, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7227774262428284, + "num_tokens": 320968187.0, + "step": 12687 + }, + { + "epoch": 1.3933670107621348, + "grad_norm": 2.7205774784088135, + "learning_rate": 1e-06, + "loss": 0.8384, + "mean_token_accuracy": 0.739655613899231, + "num_tokens": 320987119.0, + "step": 12688 + }, + { + "epoch": 1.3934768284647485, + "grad_norm": 2.1480305194854736, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7119604349136353, + "num_tokens": 321015917.0, + "step": 12689 + }, + { + "epoch": 1.3935866461673623, + "grad_norm": 2.536870002746582, + "learning_rate": 1e-06, + "loss": 0.839, + "mean_token_accuracy": 0.7333974242210388, + "num_tokens": 321037427.0, + "step": 12690 + }, + { + "epoch": 1.3936964638699758, + "grad_norm": 2.264544725418091, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7139889001846313, + "num_tokens": 321064380.0, + "step": 12691 + }, + { + "epoch": 1.3938062815725896, + "grad_norm": 2.335618734359741, + "learning_rate": 1e-06, + "loss": 0.8771, + "mean_token_accuracy": 
0.7352060079574585, + "num_tokens": 321087146.0, + "step": 12692 + }, + { + "epoch": 1.393916099275203, + "grad_norm": 2.2395317554473877, + "learning_rate": 1e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7387720942497253, + "num_tokens": 321112355.0, + "step": 12693 + }, + { + "epoch": 1.3940259169778169, + "grad_norm": 2.112614631652832, + "learning_rate": 1e-06, + "loss": 0.8654, + "mean_token_accuracy": 0.7316357493400574, + "num_tokens": 321139412.0, + "step": 12694 + }, + { + "epoch": 1.3941357346804306, + "grad_norm": 2.2028348445892334, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.724702775478363, + "num_tokens": 321164667.0, + "step": 12695 + }, + { + "epoch": 1.3942455523830442, + "grad_norm": 2.168020725250244, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7208767533302307, + "num_tokens": 321191065.0, + "step": 12696 + }, + { + "epoch": 1.3943553700856577, + "grad_norm": 2.629162311553955, + "learning_rate": 1e-06, + "loss": 0.8179, + "mean_token_accuracy": 0.7459800839424133, + "num_tokens": 321211474.0, + "step": 12697 + }, + { + "epoch": 1.3944651877882714, + "grad_norm": 2.187746524810791, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7162274122238159, + "num_tokens": 321238436.0, + "step": 12698 + }, + { + "epoch": 1.3945750054908852, + "grad_norm": 2.800032615661621, + "learning_rate": 1e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.7253943681716919, + "num_tokens": 321257074.0, + "step": 12699 + }, + { + "epoch": 1.3946848231934987, + "grad_norm": 2.3536465167999268, + "learning_rate": 1e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7244913578033447, + "num_tokens": 321280266.0, + "step": 12700 + }, + { + "epoch": 1.3947946408961125, + "grad_norm": 2.730853796005249, + "learning_rate": 1e-06, + "loss": 0.8516, + "mean_token_accuracy": 0.7351992726325989, + "num_tokens": 321299892.0, + "step": 12701 + }, + { + "epoch": 1.394904458598726, + "grad_norm": 
2.798238754272461, + "learning_rate": 1e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7338752746582031, + "num_tokens": 321318648.0, + "step": 12702 + }, + { + "epoch": 1.3950142763013398, + "grad_norm": 1.9863250255584717, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.6874697208404541, + "num_tokens": 321351216.0, + "step": 12703 + }, + { + "epoch": 1.3951240940039535, + "grad_norm": 2.1366846561431885, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7072635889053345, + "num_tokens": 321381019.0, + "step": 12704 + }, + { + "epoch": 1.395233911706567, + "grad_norm": 2.261134386062622, + "learning_rate": 1e-06, + "loss": 0.8034, + "mean_token_accuracy": 0.7457653284072876, + "num_tokens": 321405183.0, + "step": 12705 + }, + { + "epoch": 1.3953437294091808, + "grad_norm": 2.1034297943115234, + "learning_rate": 1e-06, + "loss": 0.8484, + "mean_token_accuracy": 0.730110764503479, + "num_tokens": 321430838.0, + "step": 12706 + }, + { + "epoch": 1.3954535471117944, + "grad_norm": 2.0827829837799072, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7025817632675171, + "num_tokens": 321459530.0, + "step": 12707 + }, + { + "epoch": 1.3955633648144081, + "grad_norm": 2.389552593231201, + "learning_rate": 1e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.7207555770874023, + "num_tokens": 321485265.0, + "step": 12708 + }, + { + "epoch": 1.3956731825170219, + "grad_norm": 2.1338133811950684, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7042263150215149, + "num_tokens": 321513751.0, + "step": 12709 + }, + { + "epoch": 1.3957830002196354, + "grad_norm": 2.143552780151367, + "learning_rate": 1e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7342219352722168, + "num_tokens": 321541158.0, + "step": 12710 + }, + { + "epoch": 1.395892817922249, + "grad_norm": 2.3263235092163086, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7180371284484863, + "num_tokens": 
321566249.0, + "step": 12711 + }, + { + "epoch": 1.3960026356248627, + "grad_norm": 2.1166908740997314, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.6952869892120361, + "num_tokens": 321595738.0, + "step": 12712 + }, + { + "epoch": 1.3961124533274765, + "grad_norm": 2.0133652687072754, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7040964365005493, + "num_tokens": 321625331.0, + "step": 12713 + }, + { + "epoch": 1.39622227103009, + "grad_norm": 2.110781669616699, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7153759598731995, + "num_tokens": 321655932.0, + "step": 12714 + }, + { + "epoch": 1.3963320887327038, + "grad_norm": 2.105875253677368, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7047989368438721, + "num_tokens": 321686058.0, + "step": 12715 + }, + { + "epoch": 1.3964419064353173, + "grad_norm": 2.5820975303649902, + "learning_rate": 1e-06, + "loss": 0.819, + "mean_token_accuracy": 0.7396984100341797, + "num_tokens": 321705889.0, + "step": 12716 + }, + { + "epoch": 1.396551724137931, + "grad_norm": 2.3557627201080322, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7166522741317749, + "num_tokens": 321731719.0, + "step": 12717 + }, + { + "epoch": 1.3966615418405448, + "grad_norm": 2.1771981716156006, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.6988388299942017, + "num_tokens": 321760686.0, + "step": 12718 + }, + { + "epoch": 1.3967713595431583, + "grad_norm": 2.1315689086914062, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7030173540115356, + "num_tokens": 321788695.0, + "step": 12719 + }, + { + "epoch": 1.396881177245772, + "grad_norm": 2.2092232704162598, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7166840434074402, + "num_tokens": 321814650.0, + "step": 12720 + }, + { + "epoch": 1.3969909949483856, + "grad_norm": 1.9473737478256226, + "learning_rate": 
1e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7237594127655029, + "num_tokens": 321845990.0, + "step": 12721 + }, + { + "epoch": 1.3971008126509994, + "grad_norm": 2.2713100910186768, + "learning_rate": 1e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.7408300638198853, + "num_tokens": 321872466.0, + "step": 12722 + }, + { + "epoch": 1.397210630353613, + "grad_norm": 2.2587883472442627, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7227113246917725, + "num_tokens": 321897784.0, + "step": 12723 + }, + { + "epoch": 1.3973204480562267, + "grad_norm": 2.0151829719543457, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7178376913070679, + "num_tokens": 321926993.0, + "step": 12724 + }, + { + "epoch": 1.3974302657588402, + "grad_norm": 2.127797842025757, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.715325117111206, + "num_tokens": 321955099.0, + "step": 12725 + }, + { + "epoch": 1.397540083461454, + "grad_norm": 2.093318223953247, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7117751836776733, + "num_tokens": 321985519.0, + "step": 12726 + }, + { + "epoch": 1.3976499011640677, + "grad_norm": 2.286391258239746, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7152479887008667, + "num_tokens": 322012087.0, + "step": 12727 + }, + { + "epoch": 1.3977597188666813, + "grad_norm": 2.2896084785461426, + "learning_rate": 1e-06, + "loss": 1.0337, + "mean_token_accuracy": 0.6858724355697632, + "num_tokens": 322036992.0, + "step": 12728 + }, + { + "epoch": 1.397869536569295, + "grad_norm": 2.0906643867492676, + "learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.6986538171768188, + "num_tokens": 322066039.0, + "step": 12729 + }, + { + "epoch": 1.3979793542719086, + "grad_norm": 2.1240689754486084, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.7078869938850403, + "num_tokens": 322096980.0, + "step": 12730 + }, + { + 
"epoch": 1.3980891719745223, + "grad_norm": 2.692065477371216, + "learning_rate": 1e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.7332397699356079, + "num_tokens": 322115838.0, + "step": 12731 + }, + { + "epoch": 1.398198989677136, + "grad_norm": 2.487027168273926, + "learning_rate": 1e-06, + "loss": 0.8312, + "mean_token_accuracy": 0.7434970736503601, + "num_tokens": 322137036.0, + "step": 12732 + }, + { + "epoch": 1.3983088073797496, + "grad_norm": 2.267268657684326, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7157648801803589, + "num_tokens": 322163086.0, + "step": 12733 + }, + { + "epoch": 1.3984186250823631, + "grad_norm": 2.1697773933410645, + "learning_rate": 1e-06, + "loss": 0.8355, + "mean_token_accuracy": 0.7399373650550842, + "num_tokens": 322188826.0, + "step": 12734 + }, + { + "epoch": 1.398528442784977, + "grad_norm": 2.5082955360412598, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7059404850006104, + "num_tokens": 322209650.0, + "step": 12735 + }, + { + "epoch": 1.3986382604875907, + "grad_norm": 2.1670949459075928, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.6905139088630676, + "num_tokens": 322237478.0, + "step": 12736 + }, + { + "epoch": 1.3987480781902042, + "grad_norm": 2.3385181427001953, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7152342796325684, + "num_tokens": 322262142.0, + "step": 12737 + }, + { + "epoch": 1.398857895892818, + "grad_norm": 2.2802486419677734, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7187412977218628, + "num_tokens": 322286814.0, + "step": 12738 + }, + { + "epoch": 1.3989677135954315, + "grad_norm": 2.108093500137329, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.709088921546936, + "num_tokens": 322314488.0, + "step": 12739 + }, + { + "epoch": 1.3990775312980452, + "grad_norm": 2.6095521450042725, + "learning_rate": 1e-06, + "loss": 0.8802, + 
"mean_token_accuracy": 0.7185596823692322, + "num_tokens": 322335062.0, + "step": 12740 + }, + { + "epoch": 1.399187349000659, + "grad_norm": 2.352442979812622, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7087118625640869, + "num_tokens": 322359095.0, + "step": 12741 + }, + { + "epoch": 1.3992971667032725, + "grad_norm": 2.230865716934204, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7046449780464172, + "num_tokens": 322384243.0, + "step": 12742 + }, + { + "epoch": 1.3994069844058863, + "grad_norm": 2.369497299194336, + "learning_rate": 1e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.7203167676925659, + "num_tokens": 322408299.0, + "step": 12743 + }, + { + "epoch": 1.3995168021084998, + "grad_norm": 2.3056185245513916, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7122742533683777, + "num_tokens": 322433661.0, + "step": 12744 + }, + { + "epoch": 1.3996266198111136, + "grad_norm": 2.3075098991394043, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7209922075271606, + "num_tokens": 322458952.0, + "step": 12745 + }, + { + "epoch": 1.3997364375137273, + "grad_norm": 2.415747880935669, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7084449529647827, + "num_tokens": 322481654.0, + "step": 12746 + }, + { + "epoch": 1.3998462552163409, + "grad_norm": 2.0785069465637207, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7134355902671814, + "num_tokens": 322510715.0, + "step": 12747 + }, + { + "epoch": 1.3999560729189544, + "grad_norm": 2.124098062515259, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.6856890916824341, + "num_tokens": 322538468.0, + "step": 12748 + }, + { + "epoch": 1.4000658906215682, + "grad_norm": 2.3131401538848877, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7235454320907593, + "num_tokens": 322562816.0, + "step": 12749 + }, + { + "epoch": 
1.400175708324182, + "grad_norm": 2.328472375869751, + "learning_rate": 1e-06, + "loss": 0.8703, + "mean_token_accuracy": 0.7218896150588989, + "num_tokens": 322585363.0, + "step": 12750 + }, + { + "epoch": 1.4002855260267955, + "grad_norm": 2.308706045150757, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7163759469985962, + "num_tokens": 322608952.0, + "step": 12751 + }, + { + "epoch": 1.4003953437294092, + "grad_norm": 2.3036410808563232, + "learning_rate": 1e-06, + "loss": 0.849, + "mean_token_accuracy": 0.7402893304824829, + "num_tokens": 322632452.0, + "step": 12752 + }, + { + "epoch": 1.4005051614320227, + "grad_norm": 2.180954933166504, + "learning_rate": 1e-06, + "loss": 0.829, + "mean_token_accuracy": 0.7354347705841064, + "num_tokens": 322658817.0, + "step": 12753 + }, + { + "epoch": 1.4006149791346365, + "grad_norm": 2.118713855743408, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7117830514907837, + "num_tokens": 322684421.0, + "step": 12754 + }, + { + "epoch": 1.4007247968372503, + "grad_norm": 2.3274800777435303, + "learning_rate": 1e-06, + "loss": 0.8095, + "mean_token_accuracy": 0.7442681789398193, + "num_tokens": 322707390.0, + "step": 12755 + }, + { + "epoch": 1.4008346145398638, + "grad_norm": 2.2026290893554688, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.6989227533340454, + "num_tokens": 322735697.0, + "step": 12756 + }, + { + "epoch": 1.4009444322424776, + "grad_norm": 2.656947612762451, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7187905311584473, + "num_tokens": 322755454.0, + "step": 12757 + }, + { + "epoch": 1.401054249945091, + "grad_norm": 2.243711233139038, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.6966468691825867, + "num_tokens": 322782441.0, + "step": 12758 + }, + { + "epoch": 1.4011640676477048, + "grad_norm": 2.2787370681762695, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 
0.7227882146835327, + "num_tokens": 322807176.0, + "step": 12759 + }, + { + "epoch": 1.4012738853503186, + "grad_norm": 2.189073324203491, + "learning_rate": 1e-06, + "loss": 0.829, + "mean_token_accuracy": 0.7478854656219482, + "num_tokens": 322832362.0, + "step": 12760 + }, + { + "epoch": 1.4013837030529321, + "grad_norm": 2.1728715896606445, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7062628865242004, + "num_tokens": 322860825.0, + "step": 12761 + }, + { + "epoch": 1.4014935207555457, + "grad_norm": 2.3805885314941406, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7168326377868652, + "num_tokens": 322882603.0, + "step": 12762 + }, + { + "epoch": 1.4016033384581594, + "grad_norm": 2.2216033935546875, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7035913467407227, + "num_tokens": 322909428.0, + "step": 12763 + }, + { + "epoch": 1.4017131561607732, + "grad_norm": 2.442647695541382, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7127905488014221, + "num_tokens": 322931839.0, + "step": 12764 + }, + { + "epoch": 1.4018229738633867, + "grad_norm": 2.1726319789886475, + "learning_rate": 1e-06, + "loss": 0.8937, + "mean_token_accuracy": 0.7223346829414368, + "num_tokens": 322960554.0, + "step": 12765 + }, + { + "epoch": 1.4019327915660005, + "grad_norm": 2.503209352493286, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7253702878952026, + "num_tokens": 322981058.0, + "step": 12766 + }, + { + "epoch": 1.402042609268614, + "grad_norm": 2.2997117042541504, + "learning_rate": 1e-06, + "loss": 0.7943, + "mean_token_accuracy": 0.7453274130821228, + "num_tokens": 323003081.0, + "step": 12767 + }, + { + "epoch": 1.4021524269712278, + "grad_norm": 2.3218159675598145, + "learning_rate": 1e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7233667373657227, + "num_tokens": 323026402.0, + "step": 12768 + }, + { + "epoch": 1.4022622446738415, + "grad_norm": 
2.145597457885742, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7213350534439087, + "num_tokens": 323051916.0, + "step": 12769 + }, + { + "epoch": 1.402372062376455, + "grad_norm": 2.2756147384643555, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7242788672447205, + "num_tokens": 323076164.0, + "step": 12770 + }, + { + "epoch": 1.4024818800790688, + "grad_norm": 2.188480854034424, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 0.7023414969444275, + "num_tokens": 323102338.0, + "step": 12771 + }, + { + "epoch": 1.4025916977816824, + "grad_norm": 2.4390923976898193, + "learning_rate": 1e-06, + "loss": 0.8778, + "mean_token_accuracy": 0.737555205821991, + "num_tokens": 323124964.0, + "step": 12772 + }, + { + "epoch": 1.4027015154842961, + "grad_norm": 2.4452826976776123, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7246836423873901, + "num_tokens": 323147550.0, + "step": 12773 + }, + { + "epoch": 1.4028113331869096, + "grad_norm": 1.9971634149551392, + "learning_rate": 1e-06, + "loss": 0.8428, + "mean_token_accuracy": 0.7337885499000549, + "num_tokens": 323176590.0, + "step": 12774 + }, + { + "epoch": 1.4029211508895234, + "grad_norm": 2.2354822158813477, + "learning_rate": 1e-06, + "loss": 0.8583, + "mean_token_accuracy": 0.7306118011474609, + "num_tokens": 323201298.0, + "step": 12775 + }, + { + "epoch": 1.403030968592137, + "grad_norm": 2.3023476600646973, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7104116678237915, + "num_tokens": 323226316.0, + "step": 12776 + }, + { + "epoch": 1.4031407862947507, + "grad_norm": 2.198469638824463, + "learning_rate": 1e-06, + "loss": 0.8513, + "mean_token_accuracy": 0.748998761177063, + "num_tokens": 323251205.0, + "step": 12777 + }, + { + "epoch": 1.4032506039973645, + "grad_norm": 2.163836717605591, + "learning_rate": 1e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7175564169883728, + "num_tokens": 
323277797.0, + "step": 12778 + }, + { + "epoch": 1.403360421699978, + "grad_norm": 2.0145788192749023, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.7035006284713745, + "num_tokens": 323308700.0, + "step": 12779 + }, + { + "epoch": 1.4034702394025917, + "grad_norm": 1.94463050365448, + "learning_rate": 1e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.6876667737960815, + "num_tokens": 323342033.0, + "step": 12780 + }, + { + "epoch": 1.4035800571052053, + "grad_norm": 1.977413296699524, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7201271057128906, + "num_tokens": 323372620.0, + "step": 12781 + }, + { + "epoch": 1.403689874807819, + "grad_norm": 2.003363609313965, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.6964502334594727, + "num_tokens": 323402497.0, + "step": 12782 + }, + { + "epoch": 1.4037996925104328, + "grad_norm": 2.4063720703125, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7211055755615234, + "num_tokens": 323425405.0, + "step": 12783 + }, + { + "epoch": 1.4039095102130463, + "grad_norm": 2.1747148036956787, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.698968768119812, + "num_tokens": 323453691.0, + "step": 12784 + }, + { + "epoch": 1.40401932791566, + "grad_norm": 2.0964975357055664, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7013357877731323, + "num_tokens": 323481476.0, + "step": 12785 + }, + { + "epoch": 1.4041291456182736, + "grad_norm": 2.0264933109283447, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.6986097097396851, + "num_tokens": 323512708.0, + "step": 12786 + }, + { + "epoch": 1.4042389633208874, + "grad_norm": 2.3898730278015137, + "learning_rate": 1e-06, + "loss": 0.8685, + "mean_token_accuracy": 0.7269203662872314, + "num_tokens": 323535087.0, + "step": 12787 + }, + { + "epoch": 1.404348781023501, + "grad_norm": 2.41196608543396, + "learning_rate": 1e-06, + 
"loss": 0.8359, + "mean_token_accuracy": 0.739469587802887, + "num_tokens": 323557099.0, + "step": 12788 + }, + { + "epoch": 1.4044585987261147, + "grad_norm": 2.0685901641845703, + "learning_rate": 1e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7190656661987305, + "num_tokens": 323584394.0, + "step": 12789 + }, + { + "epoch": 1.4045684164287282, + "grad_norm": 2.1906731128692627, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7273939847946167, + "num_tokens": 323611607.0, + "step": 12790 + }, + { + "epoch": 1.404678234131342, + "grad_norm": 2.487344980239868, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7188270688056946, + "num_tokens": 323631958.0, + "step": 12791 + }, + { + "epoch": 1.4047880518339557, + "grad_norm": 2.133970022201538, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7131901383399963, + "num_tokens": 323660363.0, + "step": 12792 + }, + { + "epoch": 1.4048978695365693, + "grad_norm": 2.4718539714813232, + "learning_rate": 1e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7301477193832397, + "num_tokens": 323682028.0, + "step": 12793 + }, + { + "epoch": 1.405007687239183, + "grad_norm": 2.585376024246216, + "learning_rate": 1e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7301957607269287, + "num_tokens": 323702553.0, + "step": 12794 + }, + { + "epoch": 1.4051175049417965, + "grad_norm": 2.2509119510650635, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7162133455276489, + "num_tokens": 323728945.0, + "step": 12795 + }, + { + "epoch": 1.4052273226444103, + "grad_norm": 2.280494451522827, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7219830751419067, + "num_tokens": 323754465.0, + "step": 12796 + }, + { + "epoch": 1.405337140347024, + "grad_norm": 2.5438263416290283, + "learning_rate": 1e-06, + "loss": 0.8448, + "mean_token_accuracy": 0.7277520298957825, + "num_tokens": 323774595.0, + "step": 12797 + }, + { + "epoch": 
1.4054469580496376, + "grad_norm": 2.6511499881744385, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.726256787776947, + "num_tokens": 323794396.0, + "step": 12798 + }, + { + "epoch": 1.4055567757522511, + "grad_norm": 2.2271299362182617, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.6988799571990967, + "num_tokens": 323823047.0, + "step": 12799 + }, + { + "epoch": 1.4056665934548649, + "grad_norm": 2.5508673191070557, + "learning_rate": 1e-06, + "loss": 0.8405, + "mean_token_accuracy": 0.7326864004135132, + "num_tokens": 323842871.0, + "step": 12800 + }, + { + "epoch": 1.4057764111574786, + "grad_norm": 2.343736171722412, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7115322351455688, + "num_tokens": 323867788.0, + "step": 12801 + }, + { + "epoch": 1.4058862288600922, + "grad_norm": 2.1675057411193848, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7078075408935547, + "num_tokens": 323896135.0, + "step": 12802 + }, + { + "epoch": 1.405996046562706, + "grad_norm": 2.4703948497772217, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.712378740310669, + "num_tokens": 323918378.0, + "step": 12803 + }, + { + "epoch": 1.4061058642653195, + "grad_norm": 2.171168565750122, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7154147028923035, + "num_tokens": 323946304.0, + "step": 12804 + }, + { + "epoch": 1.4062156819679332, + "grad_norm": 2.008997678756714, + "learning_rate": 1e-06, + "loss": 0.861, + "mean_token_accuracy": 0.7276812195777893, + "num_tokens": 323975390.0, + "step": 12805 + }, + { + "epoch": 1.406325499670547, + "grad_norm": 2.5004827976226807, + "learning_rate": 1e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.729661226272583, + "num_tokens": 323996408.0, + "step": 12806 + }, + { + "epoch": 1.4064353173731605, + "grad_norm": 2.571275234222412, + "learning_rate": 1e-06, + "loss": 0.8459, + "mean_token_accuracy": 
0.7409636974334717, + "num_tokens": 324017162.0, + "step": 12807 + }, + { + "epoch": 1.4065451350757743, + "grad_norm": 2.083303451538086, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7185364961624146, + "num_tokens": 324044044.0, + "step": 12808 + }, + { + "epoch": 1.4066549527783878, + "grad_norm": 2.259467601776123, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.720843493938446, + "num_tokens": 324068453.0, + "step": 12809 + }, + { + "epoch": 1.4067647704810016, + "grad_norm": 2.6036436557769775, + "learning_rate": 1e-06, + "loss": 0.829, + "mean_token_accuracy": 0.7400767207145691, + "num_tokens": 324088528.0, + "step": 12810 + }, + { + "epoch": 1.4068745881836153, + "grad_norm": 2.219679355621338, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.6940184831619263, + "num_tokens": 324115978.0, + "step": 12811 + }, + { + "epoch": 1.4069844058862289, + "grad_norm": 2.519181966781616, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7231017351150513, + "num_tokens": 324138714.0, + "step": 12812 + }, + { + "epoch": 1.4070942235888424, + "grad_norm": 2.2962138652801514, + "learning_rate": 1e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7228791117668152, + "num_tokens": 324162704.0, + "step": 12813 + }, + { + "epoch": 1.4072040412914562, + "grad_norm": 2.309950590133667, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7099325656890869, + "num_tokens": 324189807.0, + "step": 12814 + }, + { + "epoch": 1.40731385899407, + "grad_norm": 2.4035215377807617, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7226338386535645, + "num_tokens": 324213530.0, + "step": 12815 + }, + { + "epoch": 1.4074236766966834, + "grad_norm": 2.1671457290649414, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7164731025695801, + "num_tokens": 324241640.0, + "step": 12816 + }, + { + "epoch": 1.4075334943992972, + "grad_norm": 
2.4194822311401367, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7172186970710754, + "num_tokens": 324265397.0, + "step": 12817 + }, + { + "epoch": 1.4076433121019107, + "grad_norm": 2.0984909534454346, + "learning_rate": 1e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.7208667993545532, + "num_tokens": 324293035.0, + "step": 12818 + }, + { + "epoch": 1.4077531298045245, + "grad_norm": 2.0632948875427246, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7041090726852417, + "num_tokens": 324324083.0, + "step": 12819 + }, + { + "epoch": 1.4078629475071383, + "grad_norm": 2.43359375, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7243480682373047, + "num_tokens": 324344739.0, + "step": 12820 + }, + { + "epoch": 1.4079727652097518, + "grad_norm": 2.323577404022217, + "learning_rate": 1e-06, + "loss": 0.8226, + "mean_token_accuracy": 0.7389241456985474, + "num_tokens": 324366453.0, + "step": 12821 + }, + { + "epoch": 1.4080825829123655, + "grad_norm": 2.176780939102173, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7316083908081055, + "num_tokens": 324391091.0, + "step": 12822 + }, + { + "epoch": 1.408192400614979, + "grad_norm": 2.231454610824585, + "learning_rate": 1e-06, + "loss": 0.8019, + "mean_token_accuracy": 0.744242787361145, + "num_tokens": 324414155.0, + "step": 12823 + }, + { + "epoch": 1.4083022183175928, + "grad_norm": 2.5300660133361816, + "learning_rate": 1e-06, + "loss": 0.8234, + "mean_token_accuracy": 0.7415958046913147, + "num_tokens": 324435619.0, + "step": 12824 + }, + { + "epoch": 1.4084120360202066, + "grad_norm": 2.7439308166503906, + "learning_rate": 1e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7323980331420898, + "num_tokens": 324453541.0, + "step": 12825 + }, + { + "epoch": 1.4085218537228201, + "grad_norm": 2.220656633377075, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7194012403488159, + "num_tokens": 
324478459.0, + "step": 12826 + }, + { + "epoch": 1.4086316714254337, + "grad_norm": 2.4529881477355957, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7113658785820007, + "num_tokens": 324501109.0, + "step": 12827 + }, + { + "epoch": 1.4087414891280474, + "grad_norm": 2.232431650161743, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7175713777542114, + "num_tokens": 324527509.0, + "step": 12828 + }, + { + "epoch": 1.4088513068306612, + "grad_norm": 1.9679059982299805, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7092082500457764, + "num_tokens": 324561515.0, + "step": 12829 + }, + { + "epoch": 1.4089611245332747, + "grad_norm": 2.354513645172119, + "learning_rate": 1e-06, + "loss": 0.8412, + "mean_token_accuracy": 0.7393625974655151, + "num_tokens": 324584820.0, + "step": 12830 + }, + { + "epoch": 1.4090709422358885, + "grad_norm": 2.2370753288269043, + "learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7061500549316406, + "num_tokens": 324610919.0, + "step": 12831 + }, + { + "epoch": 1.409180759938502, + "grad_norm": 2.3072309494018555, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7190006971359253, + "num_tokens": 324634888.0, + "step": 12832 + }, + { + "epoch": 1.4092905776411158, + "grad_norm": 2.0432698726654053, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7125102281570435, + "num_tokens": 324664285.0, + "step": 12833 + }, + { + "epoch": 1.4094003953437295, + "grad_norm": 2.5855162143707275, + "learning_rate": 1e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7263895869255066, + "num_tokens": 324683880.0, + "step": 12834 + }, + { + "epoch": 1.409510213046343, + "grad_norm": 2.2889413833618164, + "learning_rate": 1e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.7380330562591553, + "num_tokens": 324707836.0, + "step": 12835 + }, + { + "epoch": 1.4096200307489568, + "grad_norm": 2.154477596282959, + "learning_rate": 
1e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.7297225594520569, + "num_tokens": 324733789.0, + "step": 12836 + }, + { + "epoch": 1.4097298484515703, + "grad_norm": 2.4528605937957764, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7190968990325928, + "num_tokens": 324755697.0, + "step": 12837 + }, + { + "epoch": 1.409839666154184, + "grad_norm": 2.030379056930542, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7175697088241577, + "num_tokens": 324785699.0, + "step": 12838 + }, + { + "epoch": 1.4099494838567976, + "grad_norm": 2.2181363105773926, + "learning_rate": 1e-06, + "loss": 0.8279, + "mean_token_accuracy": 0.7475176453590393, + "num_tokens": 324810138.0, + "step": 12839 + }, + { + "epoch": 1.4100593015594114, + "grad_norm": 2.068057060241699, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.7001945972442627, + "num_tokens": 324840579.0, + "step": 12840 + }, + { + "epoch": 1.410169119262025, + "grad_norm": 2.296149492263794, + "learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.7293585538864136, + "num_tokens": 324864908.0, + "step": 12841 + }, + { + "epoch": 1.4102789369646387, + "grad_norm": 2.049551486968994, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.70329350233078, + "num_tokens": 324896803.0, + "step": 12842 + }, + { + "epoch": 1.4103887546672524, + "grad_norm": 2.10898494720459, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.7028816938400269, + "num_tokens": 324924399.0, + "step": 12843 + }, + { + "epoch": 1.410498572369866, + "grad_norm": 2.222416400909424, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7042281627655029, + "num_tokens": 324951004.0, + "step": 12844 + }, + { + "epoch": 1.4106083900724797, + "grad_norm": 2.030241012573242, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7025922536849976, + "num_tokens": 324980469.0, + "step": 12845 + }, + { + 
"epoch": 1.4107182077750933, + "grad_norm": 2.273550033569336, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7105877995491028, + "num_tokens": 325007510.0, + "step": 12846 + }, + { + "epoch": 1.410828025477707, + "grad_norm": 2.0237298011779785, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7104659080505371, + "num_tokens": 325037496.0, + "step": 12847 + }, + { + "epoch": 1.4109378431803208, + "grad_norm": 2.3014960289001465, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7034266591072083, + "num_tokens": 325063155.0, + "step": 12848 + }, + { + "epoch": 1.4110476608829343, + "grad_norm": 2.15444278717041, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7147189378738403, + "num_tokens": 325092378.0, + "step": 12849 + }, + { + "epoch": 1.4111574785855479, + "grad_norm": 2.1332175731658936, + "learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.697266697883606, + "num_tokens": 325121862.0, + "step": 12850 + }, + { + "epoch": 1.4112672962881616, + "grad_norm": 2.32967472076416, + "learning_rate": 1e-06, + "loss": 0.872, + "mean_token_accuracy": 0.7220410108566284, + "num_tokens": 325146322.0, + "step": 12851 + }, + { + "epoch": 1.4113771139907754, + "grad_norm": 2.5073821544647217, + "learning_rate": 1e-06, + "loss": 0.8499, + "mean_token_accuracy": 0.7324890494346619, + "num_tokens": 325170611.0, + "step": 12852 + }, + { + "epoch": 1.411486931693389, + "grad_norm": 2.2516448497772217, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7137320637702942, + "num_tokens": 325198745.0, + "step": 12853 + }, + { + "epoch": 1.4115967493960027, + "grad_norm": 2.491178512573242, + "learning_rate": 1e-06, + "loss": 0.8332, + "mean_token_accuracy": 0.7403256893157959, + "num_tokens": 325219512.0, + "step": 12854 + }, + { + "epoch": 1.4117065670986162, + "grad_norm": 2.134945869445801, + "learning_rate": 1e-06, + "loss": 0.9061, + 
"mean_token_accuracy": 0.7312672138214111, + "num_tokens": 325247631.0, + "step": 12855 + }, + { + "epoch": 1.41181638480123, + "grad_norm": 1.9864881038665771, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7158340215682983, + "num_tokens": 325280658.0, + "step": 12856 + }, + { + "epoch": 1.4119262025038437, + "grad_norm": 1.9109227657318115, + "learning_rate": 1e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7314932346343994, + "num_tokens": 325314292.0, + "step": 12857 + }, + { + "epoch": 1.4120360202064572, + "grad_norm": 2.4434802532196045, + "learning_rate": 1e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7342998385429382, + "num_tokens": 325335160.0, + "step": 12858 + }, + { + "epoch": 1.412145837909071, + "grad_norm": 2.331782817840576, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7205063700675964, + "num_tokens": 325360141.0, + "step": 12859 + }, + { + "epoch": 1.4122556556116845, + "grad_norm": 2.0858819484710693, + "learning_rate": 1e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.6883952617645264, + "num_tokens": 325392052.0, + "step": 12860 + }, + { + "epoch": 1.4123654733142983, + "grad_norm": 2.3097400665283203, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7182428240776062, + "num_tokens": 325418206.0, + "step": 12861 + }, + { + "epoch": 1.412475291016912, + "grad_norm": 2.3013105392456055, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7092099189758301, + "num_tokens": 325443015.0, + "step": 12862 + }, + { + "epoch": 1.4125851087195256, + "grad_norm": 2.35858416557312, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7173798680305481, + "num_tokens": 325466489.0, + "step": 12863 + }, + { + "epoch": 1.4126949264221391, + "grad_norm": 2.1778242588043213, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7165273427963257, + "num_tokens": 325493247.0, + "step": 12864 + }, + { + "epoch": 1.4128047441247529, 
+ "grad_norm": 2.5571799278259277, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.721102237701416, + "num_tokens": 325513998.0, + "step": 12865 + }, + { + "epoch": 1.4129145618273666, + "grad_norm": 2.2417569160461426, + "learning_rate": 1e-06, + "loss": 0.876, + "mean_token_accuracy": 0.7227124571800232, + "num_tokens": 325538391.0, + "step": 12866 + }, + { + "epoch": 1.4130243795299802, + "grad_norm": 2.3627634048461914, + "learning_rate": 1e-06, + "loss": 0.8628, + "mean_token_accuracy": 0.7320849895477295, + "num_tokens": 325563110.0, + "step": 12867 + }, + { + "epoch": 1.413134197232594, + "grad_norm": 2.223602056503296, + "learning_rate": 1e-06, + "loss": 0.881, + "mean_token_accuracy": 0.7310296297073364, + "num_tokens": 325590991.0, + "step": 12868 + }, + { + "epoch": 1.4132440149352075, + "grad_norm": 1.8589041233062744, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7165787816047668, + "num_tokens": 325629924.0, + "step": 12869 + }, + { + "epoch": 1.4133538326378212, + "grad_norm": 1.9717110395431519, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.6878841519355774, + "num_tokens": 325664529.0, + "step": 12870 + }, + { + "epoch": 1.413463650340435, + "grad_norm": 2.2991867065429688, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7088987827301025, + "num_tokens": 325690197.0, + "step": 12871 + }, + { + "epoch": 1.4135734680430485, + "grad_norm": 2.557687759399414, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7158740758895874, + "num_tokens": 325710449.0, + "step": 12872 + }, + { + "epoch": 1.4136832857456623, + "grad_norm": 2.3294787406921387, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7260617017745972, + "num_tokens": 325734588.0, + "step": 12873 + }, + { + "epoch": 1.4137931034482758, + "grad_norm": 2.088747024536133, + "learning_rate": 1e-06, + "loss": 0.8276, + "mean_token_accuracy": 0.7470460534095764, + 
"num_tokens": 325762082.0, + "step": 12874 + }, + { + "epoch": 1.4139029211508896, + "grad_norm": 2.391923427581787, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7283587455749512, + "num_tokens": 325784498.0, + "step": 12875 + }, + { + "epoch": 1.4140127388535033, + "grad_norm": 2.3611631393432617, + "learning_rate": 1e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.730634331703186, + "num_tokens": 325806918.0, + "step": 12876 + }, + { + "epoch": 1.4141225565561168, + "grad_norm": 1.976435661315918, + "learning_rate": 1e-06, + "loss": 0.8999, + "mean_token_accuracy": 0.7293576002120972, + "num_tokens": 325836451.0, + "step": 12877 + }, + { + "epoch": 1.4142323742587304, + "grad_norm": 2.4050657749176025, + "learning_rate": 1e-06, + "loss": 0.881, + "mean_token_accuracy": 0.7267030477523804, + "num_tokens": 325860537.0, + "step": 12878 + }, + { + "epoch": 1.4143421919613441, + "grad_norm": 2.2960381507873535, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7108093500137329, + "num_tokens": 325886604.0, + "step": 12879 + }, + { + "epoch": 1.414452009663958, + "grad_norm": 2.1337993144989014, + "learning_rate": 1e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7356295585632324, + "num_tokens": 325912066.0, + "step": 12880 + }, + { + "epoch": 1.4145618273665714, + "grad_norm": 1.879067301750183, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7010077238082886, + "num_tokens": 325945863.0, + "step": 12881 + }, + { + "epoch": 1.4146716450691852, + "grad_norm": 2.1644697189331055, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7127343416213989, + "num_tokens": 325972605.0, + "step": 12882 + }, + { + "epoch": 1.4147814627717987, + "grad_norm": 2.564039707183838, + "learning_rate": 1e-06, + "loss": 0.8785, + "mean_token_accuracy": 0.7194939255714417, + "num_tokens": 325993445.0, + "step": 12883 + }, + { + "epoch": 1.4148912804744125, + "grad_norm": 1.9450712203979492, + 
"learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7065973877906799, + "num_tokens": 326026218.0, + "step": 12884 + }, + { + "epoch": 1.4150010981770262, + "grad_norm": 2.3942363262176514, + "learning_rate": 1e-06, + "loss": 0.8413, + "mean_token_accuracy": 0.7360780239105225, + "num_tokens": 326049823.0, + "step": 12885 + }, + { + "epoch": 1.4151109158796398, + "grad_norm": 2.0914509296417236, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.704371988773346, + "num_tokens": 326080810.0, + "step": 12886 + }, + { + "epoch": 1.4152207335822535, + "grad_norm": 2.2701709270477295, + "learning_rate": 1e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.7238339781761169, + "num_tokens": 326106416.0, + "step": 12887 + }, + { + "epoch": 1.415330551284867, + "grad_norm": 2.163933038711548, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.716448962688446, + "num_tokens": 326133799.0, + "step": 12888 + }, + { + "epoch": 1.4154403689874808, + "grad_norm": 2.0862696170806885, + "learning_rate": 1e-06, + "loss": 0.7872, + "mean_token_accuracy": 0.7566804885864258, + "num_tokens": 326161673.0, + "step": 12889 + }, + { + "epoch": 1.4155501866900946, + "grad_norm": 2.2444324493408203, + "learning_rate": 1e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.732811450958252, + "num_tokens": 326186585.0, + "step": 12890 + }, + { + "epoch": 1.4156600043927081, + "grad_norm": 2.361581563949585, + "learning_rate": 1e-06, + "loss": 0.8727, + "mean_token_accuracy": 0.7368031740188599, + "num_tokens": 326210299.0, + "step": 12891 + }, + { + "epoch": 1.4157698220953217, + "grad_norm": 1.8501790761947632, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.726270854473114, + "num_tokens": 326244485.0, + "step": 12892 + }, + { + "epoch": 1.4158796397979354, + "grad_norm": 2.3055334091186523, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7156496644020081, + "num_tokens": 326268060.0, + "step": 
12893 + }, + { + "epoch": 1.4159894575005492, + "grad_norm": 2.1616175174713135, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7136557102203369, + "num_tokens": 326295694.0, + "step": 12894 + }, + { + "epoch": 1.4160992752031627, + "grad_norm": 2.4207041263580322, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7148586511611938, + "num_tokens": 326319430.0, + "step": 12895 + }, + { + "epoch": 1.4162090929057765, + "grad_norm": 2.442840337753296, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7141069769859314, + "num_tokens": 326341096.0, + "step": 12896 + }, + { + "epoch": 1.41631891060839, + "grad_norm": 2.058154582977295, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7119884490966797, + "num_tokens": 326370227.0, + "step": 12897 + }, + { + "epoch": 1.4164287283110037, + "grad_norm": 2.5885369777679443, + "learning_rate": 1e-06, + "loss": 0.819, + "mean_token_accuracy": 0.7410444021224976, + "num_tokens": 326389177.0, + "step": 12898 + }, + { + "epoch": 1.4165385460136175, + "grad_norm": 2.0195372104644775, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7067945599555969, + "num_tokens": 326422525.0, + "step": 12899 + }, + { + "epoch": 1.416648363716231, + "grad_norm": 2.2743871212005615, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7018909454345703, + "num_tokens": 326447395.0, + "step": 12900 + }, + { + "epoch": 1.4167581814188448, + "grad_norm": 2.1685502529144287, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7212517261505127, + "num_tokens": 326474184.0, + "step": 12901 + }, + { + "epoch": 1.4168679991214583, + "grad_norm": 2.0031704902648926, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7066152095794678, + "num_tokens": 326507128.0, + "step": 12902 + }, + { + "epoch": 1.416977816824072, + "grad_norm": 2.2434349060058594, + "learning_rate": 1e-06, + "loss": 0.9389, + 
"mean_token_accuracy": 0.7069269418716431, + "num_tokens": 326535282.0, + "step": 12903 + }, + { + "epoch": 1.4170876345266856, + "grad_norm": 2.4216387271881104, + "learning_rate": 1e-06, + "loss": 0.8502, + "mean_token_accuracy": 0.7316232919692993, + "num_tokens": 326556278.0, + "step": 12904 + }, + { + "epoch": 1.4171974522292994, + "grad_norm": 2.1034648418426514, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7218244075775146, + "num_tokens": 326583119.0, + "step": 12905 + }, + { + "epoch": 1.417307269931913, + "grad_norm": 2.2335216999053955, + "learning_rate": 1e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.7242284417152405, + "num_tokens": 326607944.0, + "step": 12906 + }, + { + "epoch": 1.4174170876345267, + "grad_norm": 2.19673228263855, + "learning_rate": 1e-06, + "loss": 0.7868, + "mean_token_accuracy": 0.7517217397689819, + "num_tokens": 326633885.0, + "step": 12907 + }, + { + "epoch": 1.4175269053371404, + "grad_norm": 2.3675358295440674, + "learning_rate": 1e-06, + "loss": 0.8274, + "mean_token_accuracy": 0.7401726841926575, + "num_tokens": 326654061.0, + "step": 12908 + }, + { + "epoch": 1.417636723039754, + "grad_norm": 2.3401269912719727, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.712949275970459, + "num_tokens": 326677934.0, + "step": 12909 + }, + { + "epoch": 1.4177465407423677, + "grad_norm": 1.9347045421600342, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.71649169921875, + "num_tokens": 326712121.0, + "step": 12910 + }, + { + "epoch": 1.4178563584449813, + "grad_norm": 2.122148275375366, + "learning_rate": 1e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.7182076573371887, + "num_tokens": 326743086.0, + "step": 12911 + }, + { + "epoch": 1.417966176147595, + "grad_norm": 2.270332098007202, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7087467312812805, + "num_tokens": 326767726.0, + "step": 12912 + }, + { + "epoch": 1.4180759938502088, + 
"grad_norm": 2.510770320892334, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7159466743469238, + "num_tokens": 326787868.0, + "step": 12913 + }, + { + "epoch": 1.4181858115528223, + "grad_norm": 2.2204298973083496, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7105446457862854, + "num_tokens": 326817284.0, + "step": 12914 + }, + { + "epoch": 1.4182956292554358, + "grad_norm": 2.1468427181243896, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7081605792045593, + "num_tokens": 326844879.0, + "step": 12915 + }, + { + "epoch": 1.4184054469580496, + "grad_norm": 2.2803585529327393, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7236723303794861, + "num_tokens": 326869029.0, + "step": 12916 + }, + { + "epoch": 1.4185152646606634, + "grad_norm": 2.280085325241089, + "learning_rate": 1e-06, + "loss": 0.8426, + "mean_token_accuracy": 0.732113242149353, + "num_tokens": 326893601.0, + "step": 12917 + }, + { + "epoch": 1.418625082363277, + "grad_norm": 2.2740097045898438, + "learning_rate": 1e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7164273262023926, + "num_tokens": 326919833.0, + "step": 12918 + }, + { + "epoch": 1.4187349000658906, + "grad_norm": 2.2561092376708984, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7234582304954529, + "num_tokens": 326946096.0, + "step": 12919 + }, + { + "epoch": 1.4188447177685042, + "grad_norm": 2.3759474754333496, + "learning_rate": 1e-06, + "loss": 0.7989, + "mean_token_accuracy": 0.7440503239631653, + "num_tokens": 326969456.0, + "step": 12920 + }, + { + "epoch": 1.418954535471118, + "grad_norm": 2.0943355560302734, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7105562090873718, + "num_tokens": 326997463.0, + "step": 12921 + }, + { + "epoch": 1.4190643531737317, + "grad_norm": 2.0533761978149414, + "learning_rate": 1e-06, + "loss": 0.8376, + "mean_token_accuracy": 0.7335488796234131, + 
"num_tokens": 327024464.0, + "step": 12922 + }, + { + "epoch": 1.4191741708763452, + "grad_norm": 2.2161672115325928, + "learning_rate": 1e-06, + "loss": 0.877, + "mean_token_accuracy": 0.7228000164031982, + "num_tokens": 327048967.0, + "step": 12923 + }, + { + "epoch": 1.419283988578959, + "grad_norm": 2.2910244464874268, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.6881568431854248, + "num_tokens": 327075434.0, + "step": 12924 + }, + { + "epoch": 1.4193938062815725, + "grad_norm": 2.1169066429138184, + "learning_rate": 1e-06, + "loss": 0.7992, + "mean_token_accuracy": 0.7382657527923584, + "num_tokens": 327101522.0, + "step": 12925 + }, + { + "epoch": 1.4195036239841863, + "grad_norm": 2.373296022415161, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.721400797367096, + "num_tokens": 327124459.0, + "step": 12926 + }, + { + "epoch": 1.4196134416868, + "grad_norm": 2.0658011436462402, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.717989981174469, + "num_tokens": 327153516.0, + "step": 12927 + }, + { + "epoch": 1.4197232593894136, + "grad_norm": 2.193394422531128, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7193113565444946, + "num_tokens": 327179460.0, + "step": 12928 + }, + { + "epoch": 1.419833077092027, + "grad_norm": 2.1815907955169678, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7161436080932617, + "num_tokens": 327205336.0, + "step": 12929 + }, + { + "epoch": 1.4199428947946409, + "grad_norm": 2.0648107528686523, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7143163084983826, + "num_tokens": 327233781.0, + "step": 12930 + }, + { + "epoch": 1.4200527124972546, + "grad_norm": 2.1760268211364746, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.7309153079986572, + "num_tokens": 327258696.0, + "step": 12931 + }, + { + "epoch": 1.4201625301998682, + "grad_norm": 2.467742443084717, + 
"learning_rate": 1e-06, + "loss": 0.8573, + "mean_token_accuracy": 0.7281830310821533, + "num_tokens": 327279455.0, + "step": 12932 + }, + { + "epoch": 1.420272347902482, + "grad_norm": 2.050086736679077, + "learning_rate": 1e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.6832415461540222, + "num_tokens": 327310645.0, + "step": 12933 + }, + { + "epoch": 1.4203821656050954, + "grad_norm": 2.233510971069336, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7152976393699646, + "num_tokens": 327337811.0, + "step": 12934 + }, + { + "epoch": 1.4204919833077092, + "grad_norm": 2.4648725986480713, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7199350595474243, + "num_tokens": 327359125.0, + "step": 12935 + }, + { + "epoch": 1.420601801010323, + "grad_norm": 2.7201454639434814, + "learning_rate": 1e-06, + "loss": 0.8256, + "mean_token_accuracy": 0.739372193813324, + "num_tokens": 327378016.0, + "step": 12936 + }, + { + "epoch": 1.4207116187129365, + "grad_norm": 2.217125415802002, + "learning_rate": 1e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7167484760284424, + "num_tokens": 327408008.0, + "step": 12937 + }, + { + "epoch": 1.4208214364155503, + "grad_norm": 2.9879229068756104, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.71380615234375, + "num_tokens": 327425781.0, + "step": 12938 + }, + { + "epoch": 1.4209312541181638, + "grad_norm": 2.5983328819274902, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7135400772094727, + "num_tokens": 327447412.0, + "step": 12939 + }, + { + "epoch": 1.4210410718207775, + "grad_norm": 2.1468214988708496, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7051330208778381, + "num_tokens": 327473855.0, + "step": 12940 + }, + { + "epoch": 1.4211508895233913, + "grad_norm": 2.1772096157073975, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7013460397720337, + "num_tokens": 327503132.0, + "step": 
12941 + }, + { + "epoch": 1.4212607072260048, + "grad_norm": 2.3364810943603516, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7169535756111145, + "num_tokens": 327527257.0, + "step": 12942 + }, + { + "epoch": 1.4213705249286184, + "grad_norm": 2.2227091789245605, + "learning_rate": 1e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.7273041009902954, + "num_tokens": 327554524.0, + "step": 12943 + }, + { + "epoch": 1.4214803426312321, + "grad_norm": 2.116438388824463, + "learning_rate": 1e-06, + "loss": 0.8295, + "mean_token_accuracy": 0.7393640875816345, + "num_tokens": 327581922.0, + "step": 12944 + }, + { + "epoch": 1.4215901603338459, + "grad_norm": 2.2889151573181152, + "learning_rate": 1e-06, + "loss": 0.7341, + "mean_token_accuracy": 0.7575258016586304, + "num_tokens": 327604032.0, + "step": 12945 + }, + { + "epoch": 1.4216999780364594, + "grad_norm": 2.44342041015625, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.6950812935829163, + "num_tokens": 327627851.0, + "step": 12946 + }, + { + "epoch": 1.4218097957390732, + "grad_norm": 2.4028146266937256, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7158218622207642, + "num_tokens": 327652508.0, + "step": 12947 + }, + { + "epoch": 1.4219196134416867, + "grad_norm": 2.437711477279663, + "learning_rate": 1e-06, + "loss": 0.8529, + "mean_token_accuracy": 0.7358802556991577, + "num_tokens": 327674642.0, + "step": 12948 + }, + { + "epoch": 1.4220294311443005, + "grad_norm": 2.3463070392608643, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7057393789291382, + "num_tokens": 327701844.0, + "step": 12949 + }, + { + "epoch": 1.4221392488469142, + "grad_norm": 2.505772590637207, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7031406164169312, + "num_tokens": 327724559.0, + "step": 12950 + }, + { + "epoch": 1.4222490665495278, + "grad_norm": 2.18533992767334, + "learning_rate": 1e-06, + "loss": 0.8835, 
+ "mean_token_accuracy": 0.7243233323097229, + "num_tokens": 327749209.0, + "step": 12951 + }, + { + "epoch": 1.4223588842521415, + "grad_norm": 2.452563762664795, + "learning_rate": 1e-06, + "loss": 0.8573, + "mean_token_accuracy": 0.7291384935379028, + "num_tokens": 327770078.0, + "step": 12952 + }, + { + "epoch": 1.422468701954755, + "grad_norm": 2.386399269104004, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7207708954811096, + "num_tokens": 327793644.0, + "step": 12953 + }, + { + "epoch": 1.4225785196573688, + "grad_norm": 2.6546759605407715, + "learning_rate": 1e-06, + "loss": 0.806, + "mean_token_accuracy": 0.7411819100379944, + "num_tokens": 327811524.0, + "step": 12954 + }, + { + "epoch": 1.4226883373599823, + "grad_norm": 2.4805991649627686, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7071108222007751, + "num_tokens": 327833998.0, + "step": 12955 + }, + { + "epoch": 1.422798155062596, + "grad_norm": 2.27534818649292, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7125269770622253, + "num_tokens": 327861887.0, + "step": 12956 + }, + { + "epoch": 1.4229079727652096, + "grad_norm": 2.328298807144165, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7276891469955444, + "num_tokens": 327886363.0, + "step": 12957 + }, + { + "epoch": 1.4230177904678234, + "grad_norm": 2.2571957111358643, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7094021439552307, + "num_tokens": 327912042.0, + "step": 12958 + }, + { + "epoch": 1.4231276081704372, + "grad_norm": 2.430931329727173, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7241722345352173, + "num_tokens": 327934723.0, + "step": 12959 + }, + { + "epoch": 1.4232374258730507, + "grad_norm": 2.474782705307007, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7145218849182129, + "num_tokens": 327956723.0, + "step": 12960 + }, + { + "epoch": 1.4233472435756644, 
+ "grad_norm": 2.200845718383789, + "learning_rate": 1e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.715655505657196, + "num_tokens": 327983498.0, + "step": 12961 + }, + { + "epoch": 1.423457061278278, + "grad_norm": 2.309016227722168, + "learning_rate": 1e-06, + "loss": 0.8545, + "mean_token_accuracy": 0.728753387928009, + "num_tokens": 328008061.0, + "step": 12962 + }, + { + "epoch": 1.4235668789808917, + "grad_norm": 2.3970468044281006, + "learning_rate": 1e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7315802574157715, + "num_tokens": 328030433.0, + "step": 12963 + }, + { + "epoch": 1.4236766966835055, + "grad_norm": 2.2193896770477295, + "learning_rate": 1e-06, + "loss": 0.8581, + "mean_token_accuracy": 0.7329592108726501, + "num_tokens": 328055222.0, + "step": 12964 + }, + { + "epoch": 1.423786514386119, + "grad_norm": 2.423368215560913, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7117562890052795, + "num_tokens": 328079225.0, + "step": 12965 + }, + { + "epoch": 1.4238963320887328, + "grad_norm": 2.5162789821624756, + "learning_rate": 1e-06, + "loss": 0.8189, + "mean_token_accuracy": 0.7425602674484253, + "num_tokens": 328100366.0, + "step": 12966 + }, + { + "epoch": 1.4240061497913463, + "grad_norm": 2.3949408531188965, + "learning_rate": 1e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7258577346801758, + "num_tokens": 328123799.0, + "step": 12967 + }, + { + "epoch": 1.42411596749396, + "grad_norm": 2.184131145477295, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7131309509277344, + "num_tokens": 328151401.0, + "step": 12968 + }, + { + "epoch": 1.4242257851965736, + "grad_norm": 2.2787609100341797, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7104337215423584, + "num_tokens": 328176050.0, + "step": 12969 + }, + { + "epoch": 1.4243356028991874, + "grad_norm": 2.1696853637695312, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7003448009490967, + 
"num_tokens": 328202339.0, + "step": 12970 + }, + { + "epoch": 1.424445420601801, + "grad_norm": 2.4745430946350098, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7038558721542358, + "num_tokens": 328224970.0, + "step": 12971 + }, + { + "epoch": 1.4245552383044147, + "grad_norm": 2.2926104068756104, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7190297245979309, + "num_tokens": 328248211.0, + "step": 12972 + }, + { + "epoch": 1.4246650560070284, + "grad_norm": 2.5879969596862793, + "learning_rate": 1e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.7283611297607422, + "num_tokens": 328268413.0, + "step": 12973 + }, + { + "epoch": 1.424774873709642, + "grad_norm": 1.9030189514160156, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7178785800933838, + "num_tokens": 328301411.0, + "step": 12974 + }, + { + "epoch": 1.4248846914122557, + "grad_norm": 2.479968786239624, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7211474180221558, + "num_tokens": 328325917.0, + "step": 12975 + }, + { + "epoch": 1.4249945091148692, + "grad_norm": 2.5619609355926514, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7039947509765625, + "num_tokens": 328349566.0, + "step": 12976 + }, + { + "epoch": 1.425104326817483, + "grad_norm": 2.3400323390960693, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.6997188925743103, + "num_tokens": 328374048.0, + "step": 12977 + }, + { + "epoch": 1.4252141445200968, + "grad_norm": 2.3447060585021973, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7111451625823975, + "num_tokens": 328398082.0, + "step": 12978 + }, + { + "epoch": 1.4253239622227103, + "grad_norm": 2.580294609069824, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7183683514595032, + "num_tokens": 328418395.0, + "step": 12979 + }, + { + "epoch": 1.4254337799253238, + "grad_norm": 2.4846951961517334, + 
"learning_rate": 1e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7290444374084473, + "num_tokens": 328440270.0, + "step": 12980 + }, + { + "epoch": 1.4255435976279376, + "grad_norm": 2.498110771179199, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7131842374801636, + "num_tokens": 328461635.0, + "step": 12981 + }, + { + "epoch": 1.4256534153305513, + "grad_norm": 2.614068031311035, + "learning_rate": 1e-06, + "loss": 0.8454, + "mean_token_accuracy": 0.7320535182952881, + "num_tokens": 328482304.0, + "step": 12982 + }, + { + "epoch": 1.4257632330331649, + "grad_norm": 2.210326910018921, + "learning_rate": 1e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7254124879837036, + "num_tokens": 328509104.0, + "step": 12983 + }, + { + "epoch": 1.4258730507357786, + "grad_norm": 2.193777322769165, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7238401770591736, + "num_tokens": 328536107.0, + "step": 12984 + }, + { + "epoch": 1.4259828684383922, + "grad_norm": 2.2271220684051514, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.6970818042755127, + "num_tokens": 328563312.0, + "step": 12985 + }, + { + "epoch": 1.426092686141006, + "grad_norm": 2.0959646701812744, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7092726826667786, + "num_tokens": 328594330.0, + "step": 12986 + }, + { + "epoch": 1.4262025038436197, + "grad_norm": 2.3066775798797607, + "learning_rate": 1e-06, + "loss": 0.8378, + "mean_token_accuracy": 0.7371937036514282, + "num_tokens": 328618964.0, + "step": 12987 + }, + { + "epoch": 1.4263123215462332, + "grad_norm": 2.294426202774048, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7300235033035278, + "num_tokens": 328643509.0, + "step": 12988 + }, + { + "epoch": 1.426422139248847, + "grad_norm": 2.5465683937072754, + "learning_rate": 1e-06, + "loss": 0.8573, + "mean_token_accuracy": 0.7290274500846863, + "num_tokens": 328664132.0, + "step": 
12989 + }, + { + "epoch": 1.4265319569514605, + "grad_norm": 2.0901973247528076, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.7029192447662354, + "num_tokens": 328693592.0, + "step": 12990 + }, + { + "epoch": 1.4266417746540743, + "grad_norm": 2.4009623527526855, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.6928941011428833, + "num_tokens": 328718673.0, + "step": 12991 + }, + { + "epoch": 1.426751592356688, + "grad_norm": 2.3974783420562744, + "learning_rate": 1e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7302519679069519, + "num_tokens": 328741611.0, + "step": 12992 + }, + { + "epoch": 1.4268614100593016, + "grad_norm": 2.254743814468384, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7299716472625732, + "num_tokens": 328766024.0, + "step": 12993 + }, + { + "epoch": 1.426971227761915, + "grad_norm": 2.1999928951263428, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7117733955383301, + "num_tokens": 328791440.0, + "step": 12994 + }, + { + "epoch": 1.4270810454645289, + "grad_norm": 2.6605629920959473, + "learning_rate": 1e-06, + "loss": 0.8127, + "mean_token_accuracy": 0.7370185852050781, + "num_tokens": 328808765.0, + "step": 12995 + }, + { + "epoch": 1.4271908631671426, + "grad_norm": 2.460728406906128, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7263132333755493, + "num_tokens": 328831351.0, + "step": 12996 + }, + { + "epoch": 1.4273006808697561, + "grad_norm": 2.2990593910217285, + "learning_rate": 1e-06, + "loss": 1.04, + "mean_token_accuracy": 0.676304042339325, + "num_tokens": 328858934.0, + "step": 12997 + }, + { + "epoch": 1.42741049857237, + "grad_norm": 2.1796324253082275, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7195028066635132, + "num_tokens": 328888576.0, + "step": 12998 + }, + { + "epoch": 1.4275203162749834, + "grad_norm": 2.4213383197784424, + "learning_rate": 1e-06, + "loss": 0.9324, + 
"mean_token_accuracy": 0.7091681957244873, + "num_tokens": 328912292.0, + "step": 12999 + }, + { + "epoch": 1.4276301339775972, + "grad_norm": 2.260796308517456, + "learning_rate": 1e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.7299983501434326, + "num_tokens": 328936289.0, + "step": 13000 + }, + { + "epoch": 1.427739951680211, + "grad_norm": 2.0969812870025635, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7166438102722168, + "num_tokens": 328963812.0, + "step": 13001 + }, + { + "epoch": 1.4278497693828245, + "grad_norm": 2.2588961124420166, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7326744198799133, + "num_tokens": 328987282.0, + "step": 13002 + }, + { + "epoch": 1.4279595870854382, + "grad_norm": 2.277880907058716, + "learning_rate": 1e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7279067635536194, + "num_tokens": 329010723.0, + "step": 13003 + }, + { + "epoch": 1.4280694047880518, + "grad_norm": 2.508042573928833, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7054635286331177, + "num_tokens": 329033802.0, + "step": 13004 + }, + { + "epoch": 1.4281792224906655, + "grad_norm": 2.147104024887085, + "learning_rate": 1e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7079432606697083, + "num_tokens": 329060606.0, + "step": 13005 + }, + { + "epoch": 1.4282890401932793, + "grad_norm": 2.3672285079956055, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7109217643737793, + "num_tokens": 329085560.0, + "step": 13006 + }, + { + "epoch": 1.4283988578958928, + "grad_norm": 2.0131454467773438, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7118600010871887, + "num_tokens": 329115863.0, + "step": 13007 + }, + { + "epoch": 1.4285086755985064, + "grad_norm": 2.352738380432129, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7234842777252197, + "num_tokens": 329138640.0, + "step": 13008 + }, + { + "epoch": 
1.4286184933011201, + "grad_norm": 2.1556999683380127, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.712928295135498, + "num_tokens": 329166673.0, + "step": 13009 + }, + { + "epoch": 1.4287283110037339, + "grad_norm": 2.2623281478881836, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.6995848417282104, + "num_tokens": 329192791.0, + "step": 13010 + }, + { + "epoch": 1.4288381287063474, + "grad_norm": 2.2009644508361816, + "learning_rate": 1e-06, + "loss": 0.8757, + "mean_token_accuracy": 0.7275303602218628, + "num_tokens": 329219283.0, + "step": 13011 + }, + { + "epoch": 1.4289479464089612, + "grad_norm": 2.2239151000976562, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7131558060646057, + "num_tokens": 329244657.0, + "step": 13012 + }, + { + "epoch": 1.4290577641115747, + "grad_norm": 2.1373250484466553, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.6967240571975708, + "num_tokens": 329274618.0, + "step": 13013 + }, + { + "epoch": 1.4291675818141885, + "grad_norm": 2.3153934478759766, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7096304893493652, + "num_tokens": 329300226.0, + "step": 13014 + }, + { + "epoch": 1.4292773995168022, + "grad_norm": 2.339303970336914, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7154675722122192, + "num_tokens": 329323011.0, + "step": 13015 + }, + { + "epoch": 1.4293872172194158, + "grad_norm": 2.3401124477386475, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7168259024620056, + "num_tokens": 329348318.0, + "step": 13016 + }, + { + "epoch": 1.4294970349220295, + "grad_norm": 2.1796278953552246, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7096117734909058, + "num_tokens": 329376965.0, + "step": 13017 + }, + { + "epoch": 1.429606852624643, + "grad_norm": 2.535125494003296, + "learning_rate": 1e-06, + "loss": 0.8424, + 
"mean_token_accuracy": 0.7340248823165894, + "num_tokens": 329397558.0, + "step": 13018 + }, + { + "epoch": 1.4297166703272568, + "grad_norm": 2.430187940597534, + "learning_rate": 1e-06, + "loss": 0.7905, + "mean_token_accuracy": 0.744269847869873, + "num_tokens": 329418486.0, + "step": 13019 + }, + { + "epoch": 1.4298264880298703, + "grad_norm": 2.0637054443359375, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7177639007568359, + "num_tokens": 329447477.0, + "step": 13020 + }, + { + "epoch": 1.429936305732484, + "grad_norm": 2.478095769882202, + "learning_rate": 1e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.7159510850906372, + "num_tokens": 329470293.0, + "step": 13021 + }, + { + "epoch": 1.4300461234350976, + "grad_norm": 2.11049485206604, + "learning_rate": 1e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7197440266609192, + "num_tokens": 329499506.0, + "step": 13022 + }, + { + "epoch": 1.4301559411377114, + "grad_norm": 2.348937511444092, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7032366991043091, + "num_tokens": 329525417.0, + "step": 13023 + }, + { + "epoch": 1.4302657588403251, + "grad_norm": 2.505476474761963, + "learning_rate": 1e-06, + "loss": 0.7618, + "mean_token_accuracy": 0.7582175135612488, + "num_tokens": 329544957.0, + "step": 13024 + }, + { + "epoch": 1.4303755765429387, + "grad_norm": 2.4295222759246826, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7218363285064697, + "num_tokens": 329568450.0, + "step": 13025 + }, + { + "epoch": 1.4304853942455524, + "grad_norm": 2.134953022003174, + "learning_rate": 1e-06, + "loss": 0.909, + "mean_token_accuracy": 0.7132753133773804, + "num_tokens": 329596468.0, + "step": 13026 + }, + { + "epoch": 1.430595211948166, + "grad_norm": 2.3065407276153564, + "learning_rate": 1e-06, + "loss": 0.8459, + "mean_token_accuracy": 0.73606938123703, + "num_tokens": 329620389.0, + "step": 13027 + }, + { + "epoch": 1.4307050296507797, + 
"grad_norm": 2.1822116374969482, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7252947092056274, + "num_tokens": 329647630.0, + "step": 13028 + }, + { + "epoch": 1.4308148473533935, + "grad_norm": 2.034902811050415, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.6964003443717957, + "num_tokens": 329680228.0, + "step": 13029 + }, + { + "epoch": 1.430924665056007, + "grad_norm": 2.179168939590454, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7148647904396057, + "num_tokens": 329707262.0, + "step": 13030 + }, + { + "epoch": 1.4310344827586206, + "grad_norm": 2.380328893661499, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7233498096466064, + "num_tokens": 329730505.0, + "step": 13031 + }, + { + "epoch": 1.4311443004612343, + "grad_norm": 2.089977979660034, + "learning_rate": 1e-06, + "loss": 0.8551, + "mean_token_accuracy": 0.7341570258140564, + "num_tokens": 329757898.0, + "step": 13032 + }, + { + "epoch": 1.431254118163848, + "grad_norm": 2.1695446968078613, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7055414319038391, + "num_tokens": 329784985.0, + "step": 13033 + }, + { + "epoch": 1.4313639358664616, + "grad_norm": 2.254194498062134, + "learning_rate": 1e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7087939381599426, + "num_tokens": 329808849.0, + "step": 13034 + }, + { + "epoch": 1.4314737535690754, + "grad_norm": 2.473400592803955, + "learning_rate": 1e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.7223512530326843, + "num_tokens": 329832009.0, + "step": 13035 + }, + { + "epoch": 1.431583571271689, + "grad_norm": 2.4190938472747803, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7269397974014282, + "num_tokens": 329855326.0, + "step": 13036 + }, + { + "epoch": 1.4316933889743026, + "grad_norm": 2.085905075073242, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7171889543533325, + 
"num_tokens": 329884083.0, + "step": 13037 + }, + { + "epoch": 1.4318032066769164, + "grad_norm": 2.099120616912842, + "learning_rate": 1e-06, + "loss": 0.8685, + "mean_token_accuracy": 0.7385104298591614, + "num_tokens": 329912429.0, + "step": 13038 + }, + { + "epoch": 1.43191302437953, + "grad_norm": 2.2155370712280273, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.6980410814285278, + "num_tokens": 329938726.0, + "step": 13039 + }, + { + "epoch": 1.4320228420821437, + "grad_norm": 2.0978403091430664, + "learning_rate": 1e-06, + "loss": 0.8313, + "mean_token_accuracy": 0.7308588624000549, + "num_tokens": 329966582.0, + "step": 13040 + }, + { + "epoch": 1.4321326597847572, + "grad_norm": 2.21234393119812, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.694777250289917, + "num_tokens": 329995252.0, + "step": 13041 + }, + { + "epoch": 1.432242477487371, + "grad_norm": 2.20334792137146, + "learning_rate": 1e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7197652459144592, + "num_tokens": 330022292.0, + "step": 13042 + }, + { + "epoch": 1.4323522951899847, + "grad_norm": 2.1153972148895264, + "learning_rate": 1e-06, + "loss": 0.8214, + "mean_token_accuracy": 0.7357615232467651, + "num_tokens": 330048056.0, + "step": 13043 + }, + { + "epoch": 1.4324621128925983, + "grad_norm": 2.3675742149353027, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7138453125953674, + "num_tokens": 330071798.0, + "step": 13044 + }, + { + "epoch": 1.4325719305952118, + "grad_norm": 2.320939302444458, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7241995930671692, + "num_tokens": 330094152.0, + "step": 13045 + }, + { + "epoch": 1.4326817482978256, + "grad_norm": 1.7844436168670654, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7191644906997681, + "num_tokens": 330129181.0, + "step": 13046 + }, + { + "epoch": 1.4327915660004393, + "grad_norm": 2.1428208351135254, + 
"learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7044707536697388, + "num_tokens": 330155758.0, + "step": 13047 + }, + { + "epoch": 1.4329013837030529, + "grad_norm": 2.296868085861206, + "learning_rate": 1e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.7328605651855469, + "num_tokens": 330180676.0, + "step": 13048 + }, + { + "epoch": 1.4330112014056666, + "grad_norm": 2.572469472885132, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7162337303161621, + "num_tokens": 330201888.0, + "step": 13049 + }, + { + "epoch": 1.4331210191082802, + "grad_norm": 1.8885585069656372, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7243994474411011, + "num_tokens": 330236354.0, + "step": 13050 + }, + { + "epoch": 1.433230836810894, + "grad_norm": 2.396282196044922, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7252504229545593, + "num_tokens": 330259107.0, + "step": 13051 + }, + { + "epoch": 1.4333406545135077, + "grad_norm": 2.262901544570923, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7102004289627075, + "num_tokens": 330283413.0, + "step": 13052 + }, + { + "epoch": 1.4334504722161212, + "grad_norm": 2.3544576168060303, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7291368246078491, + "num_tokens": 330307178.0, + "step": 13053 + }, + { + "epoch": 1.433560289918735, + "grad_norm": 2.6392576694488525, + "learning_rate": 1e-06, + "loss": 0.8482, + "mean_token_accuracy": 0.7384790182113647, + "num_tokens": 330325310.0, + "step": 13054 + }, + { + "epoch": 1.4336701076213485, + "grad_norm": 2.2526702880859375, + "learning_rate": 1e-06, + "loss": 0.855, + "mean_token_accuracy": 0.7314231991767883, + "num_tokens": 330349414.0, + "step": 13055 + }, + { + "epoch": 1.4337799253239623, + "grad_norm": 2.329775094985962, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7084351181983948, + "num_tokens": 330374048.0, + "step": 
13056 + }, + { + "epoch": 1.433889743026576, + "grad_norm": 2.2730796337127686, + "learning_rate": 1e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7260228395462036, + "num_tokens": 330398311.0, + "step": 13057 + }, + { + "epoch": 1.4339995607291895, + "grad_norm": 2.388165235519409, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7111376523971558, + "num_tokens": 330421398.0, + "step": 13058 + }, + { + "epoch": 1.434109378431803, + "grad_norm": 2.2812914848327637, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.717166543006897, + "num_tokens": 330446203.0, + "step": 13059 + }, + { + "epoch": 1.4342191961344168, + "grad_norm": 2.28507924079895, + "learning_rate": 1e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7121747136116028, + "num_tokens": 330475199.0, + "step": 13060 + }, + { + "epoch": 1.4343290138370306, + "grad_norm": 2.4775609970092773, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7164968252182007, + "num_tokens": 330499597.0, + "step": 13061 + }, + { + "epoch": 1.4344388315396441, + "grad_norm": 2.483821392059326, + "learning_rate": 1e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7249065637588501, + "num_tokens": 330523721.0, + "step": 13062 + }, + { + "epoch": 1.434548649242258, + "grad_norm": 2.4083077907562256, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.6968745589256287, + "num_tokens": 330548649.0, + "step": 13063 + }, + { + "epoch": 1.4346584669448714, + "grad_norm": 2.5520036220550537, + "learning_rate": 1e-06, + "loss": 0.8371, + "mean_token_accuracy": 0.7365632057189941, + "num_tokens": 330566626.0, + "step": 13064 + }, + { + "epoch": 1.4347682846474852, + "grad_norm": 2.329698085784912, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.717349112033844, + "num_tokens": 330590984.0, + "step": 13065 + }, + { + "epoch": 1.434878102350099, + "grad_norm": 2.3266983032226562, + "learning_rate": 1e-06, + "loss": 0.899, + 
"mean_token_accuracy": 0.7228381633758545, + "num_tokens": 330615834.0, + "step": 13066 + }, + { + "epoch": 1.4349879200527125, + "grad_norm": 2.3088877201080322, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7121444940567017, + "num_tokens": 330639378.0, + "step": 13067 + }, + { + "epoch": 1.4350977377553262, + "grad_norm": 2.2628066539764404, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7195638418197632, + "num_tokens": 330663916.0, + "step": 13068 + }, + { + "epoch": 1.4352075554579398, + "grad_norm": 2.859302282333374, + "learning_rate": 1e-06, + "loss": 0.8021, + "mean_token_accuracy": 0.7475007772445679, + "num_tokens": 330682563.0, + "step": 13069 + }, + { + "epoch": 1.4353173731605535, + "grad_norm": 2.061354875564575, + "learning_rate": 1e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.73249751329422, + "num_tokens": 330711802.0, + "step": 13070 + }, + { + "epoch": 1.4354271908631673, + "grad_norm": 2.154163360595703, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7129116058349609, + "num_tokens": 330742129.0, + "step": 13071 + }, + { + "epoch": 1.4355370085657808, + "grad_norm": 2.4499573707580566, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7201504707336426, + "num_tokens": 330765421.0, + "step": 13072 + }, + { + "epoch": 1.4356468262683943, + "grad_norm": 2.477301597595215, + "learning_rate": 1e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.7195389866828918, + "num_tokens": 330788367.0, + "step": 13073 + }, + { + "epoch": 1.435756643971008, + "grad_norm": 2.3732686042785645, + "learning_rate": 1e-06, + "loss": 0.8386, + "mean_token_accuracy": 0.7343789339065552, + "num_tokens": 330808840.0, + "step": 13074 + }, + { + "epoch": 1.4358664616736219, + "grad_norm": 2.2019565105438232, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7093681693077087, + "num_tokens": 330835677.0, + "step": 13075 + }, + { + "epoch": 
1.4359762793762354, + "grad_norm": 2.0628628730773926, + "learning_rate": 1e-06, + "loss": 0.823, + "mean_token_accuracy": 0.7463729381561279, + "num_tokens": 330865133.0, + "step": 13076 + }, + { + "epoch": 1.4360860970788492, + "grad_norm": 2.0540425777435303, + "learning_rate": 1e-06, + "loss": 1.0241, + "mean_token_accuracy": 0.6895852088928223, + "num_tokens": 330895733.0, + "step": 13077 + }, + { + "epoch": 1.4361959147814627, + "grad_norm": 2.5855863094329834, + "learning_rate": 1e-06, + "loss": 0.8566, + "mean_token_accuracy": 0.7310341596603394, + "num_tokens": 330916863.0, + "step": 13078 + }, + { + "epoch": 1.4363057324840764, + "grad_norm": 2.184966802597046, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.706627368927002, + "num_tokens": 330943200.0, + "step": 13079 + }, + { + "epoch": 1.4364155501866902, + "grad_norm": 2.6632161140441895, + "learning_rate": 1e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7351397275924683, + "num_tokens": 330963162.0, + "step": 13080 + }, + { + "epoch": 1.4365253678893037, + "grad_norm": 2.727480411529541, + "learning_rate": 1e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.733830451965332, + "num_tokens": 330982354.0, + "step": 13081 + }, + { + "epoch": 1.4366351855919175, + "grad_norm": 2.6454269886016846, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7215677499771118, + "num_tokens": 331000556.0, + "step": 13082 + }, + { + "epoch": 1.436745003294531, + "grad_norm": 2.374141216278076, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7140447497367859, + "num_tokens": 331025698.0, + "step": 13083 + }, + { + "epoch": 1.4368548209971448, + "grad_norm": 2.172947883605957, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7078852653503418, + "num_tokens": 331053471.0, + "step": 13084 + }, + { + "epoch": 1.4369646386997583, + "grad_norm": 2.2069039344787598, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 
0.7251549959182739, + "num_tokens": 331079638.0, + "step": 13085 + }, + { + "epoch": 1.437074456402372, + "grad_norm": 2.3050196170806885, + "learning_rate": 1e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7289484739303589, + "num_tokens": 331105201.0, + "step": 13086 + }, + { + "epoch": 1.4371842741049856, + "grad_norm": 2.219883918762207, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7110306620597839, + "num_tokens": 331130026.0, + "step": 13087 + }, + { + "epoch": 1.4372940918075994, + "grad_norm": 2.5719363689422607, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7110549211502075, + "num_tokens": 331150522.0, + "step": 13088 + }, + { + "epoch": 1.4374039095102131, + "grad_norm": 2.1949055194854736, + "learning_rate": 1e-06, + "loss": 0.8483, + "mean_token_accuracy": 0.733389139175415, + "num_tokens": 331176598.0, + "step": 13089 + }, + { + "epoch": 1.4375137272128267, + "grad_norm": 2.1622629165649414, + "learning_rate": 1e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7226806879043579, + "num_tokens": 331205055.0, + "step": 13090 + }, + { + "epoch": 1.4376235449154404, + "grad_norm": 2.428020715713501, + "learning_rate": 1e-06, + "loss": 0.847, + "mean_token_accuracy": 0.7391335368156433, + "num_tokens": 331227603.0, + "step": 13091 + }, + { + "epoch": 1.437733362618054, + "grad_norm": 2.4620487689971924, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7233628034591675, + "num_tokens": 331248957.0, + "step": 13092 + }, + { + "epoch": 1.4378431803206677, + "grad_norm": 2.4011662006378174, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7038141489028931, + "num_tokens": 331271118.0, + "step": 13093 + }, + { + "epoch": 1.4379529980232815, + "grad_norm": 2.178131341934204, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7175699472427368, + "num_tokens": 331299610.0, + "step": 13094 + }, + { + "epoch": 1.438062815725895, + "grad_norm": 
2.156064748764038, + "learning_rate": 1e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.7329484224319458, + "num_tokens": 331326751.0, + "step": 13095 + }, + { + "epoch": 1.4381726334285085, + "grad_norm": 2.326704502105713, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7231228351593018, + "num_tokens": 331350433.0, + "step": 13096 + }, + { + "epoch": 1.4382824511311223, + "grad_norm": 2.134214162826538, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.704065203666687, + "num_tokens": 331379686.0, + "step": 13097 + }, + { + "epoch": 1.438392268833736, + "grad_norm": 2.4289705753326416, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7304190397262573, + "num_tokens": 331402112.0, + "step": 13098 + }, + { + "epoch": 1.4385020865363496, + "grad_norm": 2.768674612045288, + "learning_rate": 1e-06, + "loss": 0.8509, + "mean_token_accuracy": 0.7281231880187988, + "num_tokens": 331421489.0, + "step": 13099 + }, + { + "epoch": 1.4386119042389633, + "grad_norm": 2.365299701690674, + "learning_rate": 1e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7290171384811401, + "num_tokens": 331444376.0, + "step": 13100 + }, + { + "epoch": 1.4387217219415769, + "grad_norm": 2.35448956489563, + "learning_rate": 1e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.723983883857727, + "num_tokens": 331467269.0, + "step": 13101 + }, + { + "epoch": 1.4388315396441906, + "grad_norm": 2.137638807296753, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.718830943107605, + "num_tokens": 331495936.0, + "step": 13102 + }, + { + "epoch": 1.4389413573468044, + "grad_norm": 2.291114330291748, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7170343995094299, + "num_tokens": 331520494.0, + "step": 13103 + }, + { + "epoch": 1.439051175049418, + "grad_norm": 1.9675318002700806, + "learning_rate": 1e-06, + "loss": 0.8211, + "mean_token_accuracy": 0.7374700307846069, + "num_tokens": 
331551021.0, + "step": 13104 + }, + { + "epoch": 1.4391609927520317, + "grad_norm": 1.9092713594436646, + "learning_rate": 1e-06, + "loss": 0.8407, + "mean_token_accuracy": 0.7403513193130493, + "num_tokens": 331582757.0, + "step": 13105 + }, + { + "epoch": 1.4392708104546452, + "grad_norm": 1.9520047903060913, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7115791440010071, + "num_tokens": 331615847.0, + "step": 13106 + }, + { + "epoch": 1.439380628157259, + "grad_norm": 2.170051336288452, + "learning_rate": 1e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.7243497371673584, + "num_tokens": 331644577.0, + "step": 13107 + }, + { + "epoch": 1.4394904458598727, + "grad_norm": 2.021359920501709, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.7009695172309875, + "num_tokens": 331673830.0, + "step": 13108 + }, + { + "epoch": 1.4396002635624863, + "grad_norm": 2.113539218902588, + "learning_rate": 1e-06, + "loss": 1.0813, + "mean_token_accuracy": 0.6793360710144043, + "num_tokens": 331702021.0, + "step": 13109 + }, + { + "epoch": 1.4397100812650998, + "grad_norm": 2.1444790363311768, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7066663503646851, + "num_tokens": 331729878.0, + "step": 13110 + }, + { + "epoch": 1.4398198989677136, + "grad_norm": 2.1548566818237305, + "learning_rate": 1e-06, + "loss": 0.8329, + "mean_token_accuracy": 0.7371946573257446, + "num_tokens": 331754142.0, + "step": 13111 + }, + { + "epoch": 1.4399297166703273, + "grad_norm": 2.18988299369812, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.71578449010849, + "num_tokens": 331781830.0, + "step": 13112 + }, + { + "epoch": 1.4400395343729409, + "grad_norm": 2.2096879482269287, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7135273218154907, + "num_tokens": 331807149.0, + "step": 13113 + }, + { + "epoch": 1.4401493520755546, + "grad_norm": 2.2862703800201416, + "learning_rate": 
1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7106199860572815, + "num_tokens": 331833320.0, + "step": 13114 + }, + { + "epoch": 1.4402591697781681, + "grad_norm": 2.1151535511016846, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7123346924781799, + "num_tokens": 331860851.0, + "step": 13115 + }, + { + "epoch": 1.440368987480782, + "grad_norm": 2.368389368057251, + "learning_rate": 1e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7249132394790649, + "num_tokens": 331884756.0, + "step": 13116 + }, + { + "epoch": 1.4404788051833957, + "grad_norm": 2.4156956672668457, + "learning_rate": 1e-06, + "loss": 0.8319, + "mean_token_accuracy": 0.7374088168144226, + "num_tokens": 331906565.0, + "step": 13117 + }, + { + "epoch": 1.4405886228860092, + "grad_norm": 2.226834535598755, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7197631001472473, + "num_tokens": 331931361.0, + "step": 13118 + }, + { + "epoch": 1.440698440588623, + "grad_norm": 2.2267558574676514, + "learning_rate": 1e-06, + "loss": 1.02, + "mean_token_accuracy": 0.6971442103385925, + "num_tokens": 331960137.0, + "step": 13119 + }, + { + "epoch": 1.4408082582912365, + "grad_norm": 2.595320701599121, + "learning_rate": 1e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7217316627502441, + "num_tokens": 331981214.0, + "step": 13120 + }, + { + "epoch": 1.4409180759938502, + "grad_norm": 2.3862388134002686, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7138845324516296, + "num_tokens": 332004869.0, + "step": 13121 + }, + { + "epoch": 1.441027893696464, + "grad_norm": 2.2579903602600098, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7140172123908997, + "num_tokens": 332029792.0, + "step": 13122 + }, + { + "epoch": 1.4411377113990775, + "grad_norm": 2.4978222846984863, + "learning_rate": 1e-06, + "loss": 0.8332, + "mean_token_accuracy": 0.7301192283630371, + "num_tokens": 332050714.0, + "step": 13123 + }, + { + 
"epoch": 1.441247529101691, + "grad_norm": 2.137820243835449, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7154384851455688, + "num_tokens": 332077564.0, + "step": 13124 + }, + { + "epoch": 1.4413573468043048, + "grad_norm": 2.326415538787842, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.7022185921669006, + "num_tokens": 332101951.0, + "step": 13125 + }, + { + "epoch": 1.4414671645069186, + "grad_norm": 2.0490951538085938, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7198227047920227, + "num_tokens": 332130691.0, + "step": 13126 + }, + { + "epoch": 1.4415769822095321, + "grad_norm": 2.6490705013275146, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7227274179458618, + "num_tokens": 332152590.0, + "step": 13127 + }, + { + "epoch": 1.4416867999121459, + "grad_norm": 2.171077013015747, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7062174081802368, + "num_tokens": 332178500.0, + "step": 13128 + }, + { + "epoch": 1.4417966176147594, + "grad_norm": 2.1426284313201904, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7028462886810303, + "num_tokens": 332209054.0, + "step": 13129 + }, + { + "epoch": 1.4419064353173732, + "grad_norm": 2.4144856929779053, + "learning_rate": 1e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7337244749069214, + "num_tokens": 332231893.0, + "step": 13130 + }, + { + "epoch": 1.442016253019987, + "grad_norm": 2.1563467979431152, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7141356468200684, + "num_tokens": 332258930.0, + "step": 13131 + }, + { + "epoch": 1.4421260707226005, + "grad_norm": 2.271777868270874, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.7228978872299194, + "num_tokens": 332281865.0, + "step": 13132 + }, + { + "epoch": 1.4422358884252142, + "grad_norm": 2.4406275749206543, + "learning_rate": 1e-06, + "loss": 0.8755, + 
"mean_token_accuracy": 0.7259419560432434, + "num_tokens": 332305150.0, + "step": 13133 + }, + { + "epoch": 1.4423457061278278, + "grad_norm": 2.224477767944336, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7073605060577393, + "num_tokens": 332332148.0, + "step": 13134 + }, + { + "epoch": 1.4424555238304415, + "grad_norm": 2.221806287765503, + "learning_rate": 1e-06, + "loss": 0.861, + "mean_token_accuracy": 0.7287735939025879, + "num_tokens": 332357188.0, + "step": 13135 + }, + { + "epoch": 1.442565341533055, + "grad_norm": 2.10929536819458, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7205163836479187, + "num_tokens": 332385133.0, + "step": 13136 + }, + { + "epoch": 1.4426751592356688, + "grad_norm": 2.226257085800171, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7019155621528625, + "num_tokens": 332410560.0, + "step": 13137 + }, + { + "epoch": 1.4427849769382823, + "grad_norm": 2.529200553894043, + "learning_rate": 1e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7238249778747559, + "num_tokens": 332431388.0, + "step": 13138 + }, + { + "epoch": 1.442894794640896, + "grad_norm": 2.181438446044922, + "learning_rate": 1e-06, + "loss": 0.8407, + "mean_token_accuracy": 0.7354792952537537, + "num_tokens": 332457734.0, + "step": 13139 + }, + { + "epoch": 1.4430046123435099, + "grad_norm": 2.420955181121826, + "learning_rate": 1e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7204595804214478, + "num_tokens": 332481178.0, + "step": 13140 + }, + { + "epoch": 1.4431144300461234, + "grad_norm": 2.711308479309082, + "learning_rate": 1e-06, + "loss": 0.8458, + "mean_token_accuracy": 0.7320977449417114, + "num_tokens": 332499676.0, + "step": 13141 + }, + { + "epoch": 1.4432242477487371, + "grad_norm": 2.016050338745117, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.6989684700965881, + "num_tokens": 332533505.0, + "step": 13142 + }, + { + "epoch": 1.4433340654513507, + 
"grad_norm": 2.5146420001983643, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7266004681587219, + "num_tokens": 332555318.0, + "step": 13143 + }, + { + "epoch": 1.4434438831539644, + "grad_norm": 2.3674941062927246, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.720488429069519, + "num_tokens": 332580523.0, + "step": 13144 + }, + { + "epoch": 1.4435537008565782, + "grad_norm": 1.934749960899353, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7058779001235962, + "num_tokens": 332617014.0, + "step": 13145 + }, + { + "epoch": 1.4436635185591917, + "grad_norm": 2.3449597358703613, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7087048292160034, + "num_tokens": 332642580.0, + "step": 13146 + }, + { + "epoch": 1.4437733362618055, + "grad_norm": 2.657892942428589, + "learning_rate": 1e-06, + "loss": 0.7725, + "mean_token_accuracy": 0.7498404383659363, + "num_tokens": 332660269.0, + "step": 13147 + }, + { + "epoch": 1.443883153964419, + "grad_norm": 2.0727622509002686, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.691646933555603, + "num_tokens": 332688302.0, + "step": 13148 + }, + { + "epoch": 1.4439929716670328, + "grad_norm": 2.152897834777832, + "learning_rate": 1e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.7336587905883789, + "num_tokens": 332716610.0, + "step": 13149 + }, + { + "epoch": 1.4441027893696463, + "grad_norm": 2.378387928009033, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7059557437896729, + "num_tokens": 332740154.0, + "step": 13150 + }, + { + "epoch": 1.44421260707226, + "grad_norm": 2.0909645557403564, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7270302772521973, + "num_tokens": 332767877.0, + "step": 13151 + }, + { + "epoch": 1.4443224247748736, + "grad_norm": 2.087618589401245, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7221938967704773, + 
"num_tokens": 332796731.0, + "step": 13152 + }, + { + "epoch": 1.4444322424774874, + "grad_norm": 2.290189027786255, + "learning_rate": 1e-06, + "loss": 0.8569, + "mean_token_accuracy": 0.7288163900375366, + "num_tokens": 332821477.0, + "step": 13153 + }, + { + "epoch": 1.4445420601801011, + "grad_norm": 2.7604377269744873, + "learning_rate": 1e-06, + "loss": 0.7661, + "mean_token_accuracy": 0.7471170425415039, + "num_tokens": 332839610.0, + "step": 13154 + }, + { + "epoch": 1.4446518778827147, + "grad_norm": 2.530827045440674, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.722464382648468, + "num_tokens": 332862256.0, + "step": 13155 + }, + { + "epoch": 1.4447616955853284, + "grad_norm": 2.1610774993896484, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.7017168402671814, + "num_tokens": 332889979.0, + "step": 13156 + }, + { + "epoch": 1.444871513287942, + "grad_norm": 2.428126573562622, + "learning_rate": 1e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7357629537582397, + "num_tokens": 332912139.0, + "step": 13157 + }, + { + "epoch": 1.4449813309905557, + "grad_norm": 2.28731369972229, + "learning_rate": 1e-06, + "loss": 0.8495, + "mean_token_accuracy": 0.7446056604385376, + "num_tokens": 332937076.0, + "step": 13158 + }, + { + "epoch": 1.4450911486931695, + "grad_norm": 2.54718017578125, + "learning_rate": 1e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.7406393885612488, + "num_tokens": 332959296.0, + "step": 13159 + }, + { + "epoch": 1.445200966395783, + "grad_norm": 2.103160858154297, + "learning_rate": 1e-06, + "loss": 0.848, + "mean_token_accuracy": 0.7320498824119568, + "num_tokens": 332985313.0, + "step": 13160 + }, + { + "epoch": 1.4453107840983965, + "grad_norm": 2.298018217086792, + "learning_rate": 1e-06, + "loss": 0.8311, + "mean_token_accuracy": 0.7387657165527344, + "num_tokens": 333008859.0, + "step": 13161 + }, + { + "epoch": 1.4454206018010103, + "grad_norm": 2.5368707180023193, + 
"learning_rate": 1e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7335302829742432, + "num_tokens": 333031171.0, + "step": 13162 + }, + { + "epoch": 1.445530419503624, + "grad_norm": 2.2591893672943115, + "learning_rate": 1e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.7275988459587097, + "num_tokens": 333058700.0, + "step": 13163 + }, + { + "epoch": 1.4456402372062376, + "grad_norm": 2.105194330215454, + "learning_rate": 1e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7272433042526245, + "num_tokens": 333088779.0, + "step": 13164 + }, + { + "epoch": 1.4457500549088513, + "grad_norm": 2.3155670166015625, + "learning_rate": 1e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.7331153750419617, + "num_tokens": 333113173.0, + "step": 13165 + }, + { + "epoch": 1.4458598726114649, + "grad_norm": 2.5712265968322754, + "learning_rate": 1e-06, + "loss": 0.7662, + "mean_token_accuracy": 0.7484325170516968, + "num_tokens": 333132851.0, + "step": 13166 + }, + { + "epoch": 1.4459696903140786, + "grad_norm": 2.3638012409210205, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7265092134475708, + "num_tokens": 333156339.0, + "step": 13167 + }, + { + "epoch": 1.4460795080166924, + "grad_norm": 2.2870218753814697, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7124422788619995, + "num_tokens": 333181970.0, + "step": 13168 + }, + { + "epoch": 1.446189325719306, + "grad_norm": 2.1898162364959717, + "learning_rate": 1e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.7254685163497925, + "num_tokens": 333207163.0, + "step": 13169 + }, + { + "epoch": 1.4462991434219197, + "grad_norm": 2.2672836780548096, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7073649168014526, + "num_tokens": 333233443.0, + "step": 13170 + }, + { + "epoch": 1.4464089611245332, + "grad_norm": 2.588351249694824, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7158951759338379, + "num_tokens": 333254096.0, + 
"step": 13171 + }, + { + "epoch": 1.446518778827147, + "grad_norm": 2.4920802116394043, + "learning_rate": 1e-06, + "loss": 0.8685, + "mean_token_accuracy": 0.7290887832641602, + "num_tokens": 333276662.0, + "step": 13172 + }, + { + "epoch": 1.4466285965297607, + "grad_norm": 2.17590594291687, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7330447435379028, + "num_tokens": 333302775.0, + "step": 13173 + }, + { + "epoch": 1.4467384142323743, + "grad_norm": 2.823967218399048, + "learning_rate": 1e-06, + "loss": 0.8032, + "mean_token_accuracy": 0.7428402304649353, + "num_tokens": 333320595.0, + "step": 13174 + }, + { + "epoch": 1.4468482319349878, + "grad_norm": 2.1752078533172607, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7109684944152832, + "num_tokens": 333350636.0, + "step": 13175 + }, + { + "epoch": 1.4469580496376016, + "grad_norm": 2.086549758911133, + "learning_rate": 1e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7337325811386108, + "num_tokens": 333380446.0, + "step": 13176 + }, + { + "epoch": 1.4470678673402153, + "grad_norm": 2.361109972000122, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7126549482345581, + "num_tokens": 333402982.0, + "step": 13177 + }, + { + "epoch": 1.4471776850428288, + "grad_norm": 2.234525442123413, + "learning_rate": 1e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7258620262145996, + "num_tokens": 333428137.0, + "step": 13178 + }, + { + "epoch": 1.4472875027454426, + "grad_norm": 2.6478312015533447, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7253555059432983, + "num_tokens": 333448664.0, + "step": 13179 + }, + { + "epoch": 1.4473973204480561, + "grad_norm": 2.1975114345550537, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7136811017990112, + "num_tokens": 333477145.0, + "step": 13180 + }, + { + "epoch": 1.44750713815067, + "grad_norm": 2.772742509841919, + "learning_rate": 1e-06, + "loss": 
0.8334, + "mean_token_accuracy": 0.7480262517929077, + "num_tokens": 333496173.0, + "step": 13181 + }, + { + "epoch": 1.4476169558532836, + "grad_norm": 2.4072039127349854, + "learning_rate": 1e-06, + "loss": 0.8188, + "mean_token_accuracy": 0.7386517524719238, + "num_tokens": 333518093.0, + "step": 13182 + }, + { + "epoch": 1.4477267735558972, + "grad_norm": 2.275789499282837, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7127392888069153, + "num_tokens": 333543715.0, + "step": 13183 + }, + { + "epoch": 1.447836591258511, + "grad_norm": 2.1704330444335938, + "learning_rate": 1e-06, + "loss": 0.7791, + "mean_token_accuracy": 0.747763454914093, + "num_tokens": 333569192.0, + "step": 13184 + }, + { + "epoch": 1.4479464089611245, + "grad_norm": 2.111116409301758, + "learning_rate": 1e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7417769432067871, + "num_tokens": 333596630.0, + "step": 13185 + }, + { + "epoch": 1.4480562266637382, + "grad_norm": 2.2788164615631104, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7230989933013916, + "num_tokens": 333622824.0, + "step": 13186 + }, + { + "epoch": 1.448166044366352, + "grad_norm": 2.310753107070923, + "learning_rate": 1e-06, + "loss": 0.8463, + "mean_token_accuracy": 0.7324954271316528, + "num_tokens": 333646634.0, + "step": 13187 + }, + { + "epoch": 1.4482758620689655, + "grad_norm": 2.253805160522461, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7133464217185974, + "num_tokens": 333673721.0, + "step": 13188 + }, + { + "epoch": 1.448385679771579, + "grad_norm": 2.040682554244995, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7086172699928284, + "num_tokens": 333702235.0, + "step": 13189 + }, + { + "epoch": 1.4484954974741928, + "grad_norm": 2.214207887649536, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7249650955200195, + "num_tokens": 333729577.0, + "step": 13190 + }, + { + "epoch": 
1.4486053151768066, + "grad_norm": 2.1427078247070312, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7039594650268555, + "num_tokens": 333755793.0, + "step": 13191 + }, + { + "epoch": 1.44871513287942, + "grad_norm": 2.2491161823272705, + "learning_rate": 1e-06, + "loss": 0.8699, + "mean_token_accuracy": 0.7299081087112427, + "num_tokens": 333780605.0, + "step": 13192 + }, + { + "epoch": 1.4488249505820339, + "grad_norm": 2.2455811500549316, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7279052734375, + "num_tokens": 333805657.0, + "step": 13193 + }, + { + "epoch": 1.4489347682846474, + "grad_norm": 2.1453893184661865, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.6890611052513123, + "num_tokens": 333834650.0, + "step": 13194 + }, + { + "epoch": 1.4490445859872612, + "grad_norm": 2.0371904373168945, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7045438289642334, + "num_tokens": 333865831.0, + "step": 13195 + }, + { + "epoch": 1.449154403689875, + "grad_norm": 2.6390633583068848, + "learning_rate": 1e-06, + "loss": 0.8399, + "mean_token_accuracy": 0.7333126068115234, + "num_tokens": 333885784.0, + "step": 13196 + }, + { + "epoch": 1.4492642213924884, + "grad_norm": 2.156984567642212, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7083157896995544, + "num_tokens": 333913100.0, + "step": 13197 + }, + { + "epoch": 1.4493740390951022, + "grad_norm": 2.1745481491088867, + "learning_rate": 1e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.7414736747741699, + "num_tokens": 333940869.0, + "step": 13198 + }, + { + "epoch": 1.4494838567977157, + "grad_norm": 2.276463031768799, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7126032114028931, + "num_tokens": 333967007.0, + "step": 13199 + }, + { + "epoch": 1.4495936745003295, + "grad_norm": 1.9498412609100342, + "learning_rate": 1e-06, + "loss": 0.7847, + "mean_token_accuracy": 
0.7592988014221191, + "num_tokens": 333994945.0, + "step": 13200 + }, + { + "epoch": 1.449703492202943, + "grad_norm": 2.3828580379486084, + "learning_rate": 1e-06, + "loss": 0.855, + "mean_token_accuracy": 0.7287511825561523, + "num_tokens": 334018487.0, + "step": 13201 + }, + { + "epoch": 1.4498133099055568, + "grad_norm": 2.253039598464966, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.6971448063850403, + "num_tokens": 334044258.0, + "step": 13202 + }, + { + "epoch": 1.4499231276081703, + "grad_norm": 2.1152732372283936, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7026360034942627, + "num_tokens": 334073057.0, + "step": 13203 + }, + { + "epoch": 1.450032945310784, + "grad_norm": 2.228318929672241, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7115168571472168, + "num_tokens": 334099178.0, + "step": 13204 + }, + { + "epoch": 1.4501427630133978, + "grad_norm": 2.32443904876709, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7043280601501465, + "num_tokens": 334123490.0, + "step": 13205 + }, + { + "epoch": 1.4502525807160114, + "grad_norm": 2.2729389667510986, + "learning_rate": 1e-06, + "loss": 0.7972, + "mean_token_accuracy": 0.7508769035339355, + "num_tokens": 334147830.0, + "step": 13206 + }, + { + "epoch": 1.4503623984186251, + "grad_norm": 2.768648147583008, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7172297239303589, + "num_tokens": 334170765.0, + "step": 13207 + }, + { + "epoch": 1.4504722161212387, + "grad_norm": 2.0777204036712646, + "learning_rate": 1e-06, + "loss": 0.8561, + "mean_token_accuracy": 0.7299075126647949, + "num_tokens": 334200202.0, + "step": 13208 + }, + { + "epoch": 1.4505820338238524, + "grad_norm": 2.0437583923339844, + "learning_rate": 1e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7297835946083069, + "num_tokens": 334228672.0, + "step": 13209 + }, + { + "epoch": 1.4506918515264662, + "grad_norm": 
2.358154296875, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7294294834136963, + "num_tokens": 334251441.0, + "step": 13210 + }, + { + "epoch": 1.4508016692290797, + "grad_norm": 2.2661092281341553, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7181491851806641, + "num_tokens": 334276828.0, + "step": 13211 + }, + { + "epoch": 1.4509114869316933, + "grad_norm": 2.3374292850494385, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7095018625259399, + "num_tokens": 334303297.0, + "step": 13212 + }, + { + "epoch": 1.451021304634307, + "grad_norm": 2.4506866931915283, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7341153621673584, + "num_tokens": 334325943.0, + "step": 13213 + }, + { + "epoch": 1.4511311223369208, + "grad_norm": 2.199895143508911, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7218835949897766, + "num_tokens": 334353302.0, + "step": 13214 + }, + { + "epoch": 1.4512409400395343, + "grad_norm": 2.196424722671509, + "learning_rate": 1e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.6851413249969482, + "num_tokens": 334381200.0, + "step": 13215 + }, + { + "epoch": 1.451350757742148, + "grad_norm": 2.4520926475524902, + "learning_rate": 1e-06, + "loss": 0.8485, + "mean_token_accuracy": 0.7252048254013062, + "num_tokens": 334402834.0, + "step": 13216 + }, + { + "epoch": 1.4514605754447616, + "grad_norm": 2.0525925159454346, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7192975282669067, + "num_tokens": 334431654.0, + "step": 13217 + }, + { + "epoch": 1.4515703931473753, + "grad_norm": 2.3516998291015625, + "learning_rate": 1e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7231719493865967, + "num_tokens": 334455124.0, + "step": 13218 + }, + { + "epoch": 1.451680210849989, + "grad_norm": 2.3868279457092285, + "learning_rate": 1e-06, + "loss": 0.7346, + "mean_token_accuracy": 0.7608356475830078, + "num_tokens": 
334477498.0, + "step": 13219 + }, + { + "epoch": 1.4517900285526026, + "grad_norm": 2.6307148933410645, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7222405076026917, + "num_tokens": 334498188.0, + "step": 13220 + }, + { + "epoch": 1.4518998462552164, + "grad_norm": 2.1940114498138428, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7166696786880493, + "num_tokens": 334525056.0, + "step": 13221 + }, + { + "epoch": 1.45200966395783, + "grad_norm": 2.561150312423706, + "learning_rate": 1e-06, + "loss": 1.0261, + "mean_token_accuracy": 0.6878734827041626, + "num_tokens": 334548403.0, + "step": 13222 + }, + { + "epoch": 1.4521194816604437, + "grad_norm": 2.3087000846862793, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7160356044769287, + "num_tokens": 334572199.0, + "step": 13223 + }, + { + "epoch": 1.4522292993630574, + "grad_norm": 2.126258134841919, + "learning_rate": 1e-06, + "loss": 1.0724, + "mean_token_accuracy": 0.6781322360038757, + "num_tokens": 334601534.0, + "step": 13224 + }, + { + "epoch": 1.452339117065671, + "grad_norm": 2.149482250213623, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.6971274614334106, + "num_tokens": 334630322.0, + "step": 13225 + }, + { + "epoch": 1.4524489347682845, + "grad_norm": 2.6434335708618164, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7257039546966553, + "num_tokens": 334651336.0, + "step": 13226 + }, + { + "epoch": 1.4525587524708983, + "grad_norm": 2.3375911712646484, + "learning_rate": 1e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.73320472240448, + "num_tokens": 334672906.0, + "step": 13227 + }, + { + "epoch": 1.452668570173512, + "grad_norm": 2.032348394393921, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7242091298103333, + "num_tokens": 334702700.0, + "step": 13228 + }, + { + "epoch": 1.4527783878761256, + "grad_norm": 2.2714059352874756, + "learning_rate": 1e-06, + 
"loss": 0.8917, + "mean_token_accuracy": 0.7203817367553711, + "num_tokens": 334729552.0, + "step": 13229 + }, + { + "epoch": 1.4528882055787393, + "grad_norm": 2.369715452194214, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7161252498626709, + "num_tokens": 334754640.0, + "step": 13230 + }, + { + "epoch": 1.4529980232813529, + "grad_norm": 2.428093671798706, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7129011750221252, + "num_tokens": 334777177.0, + "step": 13231 + }, + { + "epoch": 1.4531078409839666, + "grad_norm": 2.169721841812134, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.7019191980361938, + "num_tokens": 334803875.0, + "step": 13232 + }, + { + "epoch": 1.4532176586865804, + "grad_norm": 2.3354272842407227, + "learning_rate": 1e-06, + "loss": 0.7878, + "mean_token_accuracy": 0.7443107962608337, + "num_tokens": 334826592.0, + "step": 13233 + }, + { + "epoch": 1.453327476389194, + "grad_norm": 1.8850568532943726, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7267069816589355, + "num_tokens": 334858234.0, + "step": 13234 + }, + { + "epoch": 1.4534372940918077, + "grad_norm": 2.305367946624756, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7124308347702026, + "num_tokens": 334883822.0, + "step": 13235 + }, + { + "epoch": 1.4535471117944212, + "grad_norm": 2.029083490371704, + "learning_rate": 1e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.7318263053894043, + "num_tokens": 334912522.0, + "step": 13236 + }, + { + "epoch": 1.453656929497035, + "grad_norm": 2.0551817417144775, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7030479311943054, + "num_tokens": 334943895.0, + "step": 13237 + }, + { + "epoch": 1.4537667471996487, + "grad_norm": 1.926377296447754, + "learning_rate": 1e-06, + "loss": 0.8533, + "mean_token_accuracy": 0.730406641960144, + "num_tokens": 334977332.0, + "step": 13238 + }, + { + "epoch": 
1.4538765649022622, + "grad_norm": 2.0845956802368164, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.6923186779022217, + "num_tokens": 335006936.0, + "step": 13239 + }, + { + "epoch": 1.4539863826048758, + "grad_norm": 2.1918931007385254, + "learning_rate": 1e-06, + "loss": 0.8287, + "mean_token_accuracy": 0.7414013147354126, + "num_tokens": 335032468.0, + "step": 13240 + }, + { + "epoch": 1.4540962003074895, + "grad_norm": 2.2120018005371094, + "learning_rate": 1e-06, + "loss": 0.8697, + "mean_token_accuracy": 0.7242341637611389, + "num_tokens": 335059080.0, + "step": 13241 + }, + { + "epoch": 1.4542060180101033, + "grad_norm": 2.3406167030334473, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.6984083652496338, + "num_tokens": 335084721.0, + "step": 13242 + }, + { + "epoch": 1.4543158357127168, + "grad_norm": 2.2998342514038086, + "learning_rate": 1e-06, + "loss": 0.8067, + "mean_token_accuracy": 0.7399899363517761, + "num_tokens": 335108898.0, + "step": 13243 + }, + { + "epoch": 1.4544256534153306, + "grad_norm": 2.230380058288574, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7167866230010986, + "num_tokens": 335135451.0, + "step": 13244 + }, + { + "epoch": 1.4545354711179441, + "grad_norm": 2.3451077938079834, + "learning_rate": 1e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7338728904724121, + "num_tokens": 335159446.0, + "step": 13245 + }, + { + "epoch": 1.4546452888205579, + "grad_norm": 2.110231399536133, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7183188199996948, + "num_tokens": 335188587.0, + "step": 13246 + }, + { + "epoch": 1.4547551065231716, + "grad_norm": 2.3259406089782715, + "learning_rate": 1e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.7299653887748718, + "num_tokens": 335213345.0, + "step": 13247 + }, + { + "epoch": 1.4548649242257852, + "grad_norm": 2.3321340084075928, + "learning_rate": 1e-06, + "loss": 0.9411, + 
"mean_token_accuracy": 0.711358904838562, + "num_tokens": 335237618.0, + "step": 13248 + }, + { + "epoch": 1.454974741928399, + "grad_norm": 2.0790867805480957, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7108412981033325, + "num_tokens": 335268975.0, + "step": 13249 + }, + { + "epoch": 1.4550845596310125, + "grad_norm": 2.304628372192383, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.714631199836731, + "num_tokens": 335294037.0, + "step": 13250 + }, + { + "epoch": 1.4551943773336262, + "grad_norm": 2.063504219055176, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7219946980476379, + "num_tokens": 335323143.0, + "step": 13251 + }, + { + "epoch": 1.45530419503624, + "grad_norm": 2.31819224357605, + "learning_rate": 1e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7134674787521362, + "num_tokens": 335347221.0, + "step": 13252 + }, + { + "epoch": 1.4554140127388535, + "grad_norm": 2.292682409286499, + "learning_rate": 1e-06, + "loss": 0.7733, + "mean_token_accuracy": 0.7500041127204895, + "num_tokens": 335369536.0, + "step": 13253 + }, + { + "epoch": 1.455523830441467, + "grad_norm": 2.015127658843994, + "learning_rate": 1e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.6961679458618164, + "num_tokens": 335401044.0, + "step": 13254 + }, + { + "epoch": 1.4556336481440808, + "grad_norm": 2.0602943897247314, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7042988538742065, + "num_tokens": 335431825.0, + "step": 13255 + }, + { + "epoch": 1.4557434658466946, + "grad_norm": 2.313892126083374, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7260158061981201, + "num_tokens": 335454489.0, + "step": 13256 + }, + { + "epoch": 1.455853283549308, + "grad_norm": 2.088817596435547, + "learning_rate": 1e-06, + "loss": 0.8047, + "mean_token_accuracy": 0.7402821779251099, + "num_tokens": 335480726.0, + "step": 13257 + }, + { + "epoch": 1.4559631012519219, + 
"grad_norm": 2.21753191947937, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7119200229644775, + "num_tokens": 335507018.0, + "step": 13258 + }, + { + "epoch": 1.4560729189545354, + "grad_norm": 2.3882906436920166, + "learning_rate": 1e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.7324115633964539, + "num_tokens": 335530510.0, + "step": 13259 + }, + { + "epoch": 1.4561827366571491, + "grad_norm": 2.416252374649048, + "learning_rate": 1e-06, + "loss": 0.816, + "mean_token_accuracy": 0.7515091896057129, + "num_tokens": 335552735.0, + "step": 13260 + }, + { + "epoch": 1.456292554359763, + "grad_norm": 2.4207959175109863, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7226947546005249, + "num_tokens": 335577272.0, + "step": 13261 + }, + { + "epoch": 1.4564023720623764, + "grad_norm": 2.2736339569091797, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7214810848236084, + "num_tokens": 335603960.0, + "step": 13262 + }, + { + "epoch": 1.4565121897649902, + "grad_norm": 2.662482738494873, + "learning_rate": 1e-06, + "loss": 0.8649, + "mean_token_accuracy": 0.7277998328208923, + "num_tokens": 335624503.0, + "step": 13263 + }, + { + "epoch": 1.4566220074676037, + "grad_norm": 2.357119560241699, + "learning_rate": 1e-06, + "loss": 0.833, + "mean_token_accuracy": 0.7366559505462646, + "num_tokens": 335648603.0, + "step": 13264 + }, + { + "epoch": 1.4567318251702175, + "grad_norm": 2.2988944053649902, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.6943989992141724, + "num_tokens": 335673933.0, + "step": 13265 + }, + { + "epoch": 1.456841642872831, + "grad_norm": 2.4852654933929443, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7191728353500366, + "num_tokens": 335696141.0, + "step": 13266 + }, + { + "epoch": 1.4569514605754448, + "grad_norm": 2.2272655963897705, + "learning_rate": 1e-06, + "loss": 0.8329, + "mean_token_accuracy": 0.7349309325218201, + 
"num_tokens": 335720430.0, + "step": 13267 + }, + { + "epoch": 1.4570612782780583, + "grad_norm": 1.8468093872070312, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.6942790746688843, + "num_tokens": 335758878.0, + "step": 13268 + }, + { + "epoch": 1.457171095980672, + "grad_norm": 2.2443652153015137, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7245803475379944, + "num_tokens": 335784029.0, + "step": 13269 + }, + { + "epoch": 1.4572809136832858, + "grad_norm": 2.15664005279541, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7177977561950684, + "num_tokens": 335812213.0, + "step": 13270 + }, + { + "epoch": 1.4573907313858994, + "grad_norm": 2.2568764686584473, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7215306758880615, + "num_tokens": 335836843.0, + "step": 13271 + }, + { + "epoch": 1.4575005490885131, + "grad_norm": 2.120081663131714, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.713832676410675, + "num_tokens": 335864286.0, + "step": 13272 + }, + { + "epoch": 1.4576103667911267, + "grad_norm": 2.4287350177764893, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7044504284858704, + "num_tokens": 335890002.0, + "step": 13273 + }, + { + "epoch": 1.4577201844937404, + "grad_norm": 2.3305301666259766, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.707956075668335, + "num_tokens": 335915786.0, + "step": 13274 + }, + { + "epoch": 1.4578300021963542, + "grad_norm": 2.122302293777466, + "learning_rate": 1e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7281122803688049, + "num_tokens": 335945044.0, + "step": 13275 + }, + { + "epoch": 1.4579398198989677, + "grad_norm": 2.2072439193725586, + "learning_rate": 1e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7340091466903687, + "num_tokens": 335969005.0, + "step": 13276 + }, + { + "epoch": 1.4580496376015812, + "grad_norm": 1.963210105895996, + 
"learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7158344388008118, + "num_tokens": 336002080.0, + "step": 13277 + }, + { + "epoch": 1.458159455304195, + "grad_norm": 2.11684250831604, + "learning_rate": 1e-06, + "loss": 0.8264, + "mean_token_accuracy": 0.7402804493904114, + "num_tokens": 336030419.0, + "step": 13278 + }, + { + "epoch": 1.4582692730068088, + "grad_norm": 2.213019609451294, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.6949289441108704, + "num_tokens": 336057951.0, + "step": 13279 + }, + { + "epoch": 1.4583790907094223, + "grad_norm": 2.4692349433898926, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.721655011177063, + "num_tokens": 336080053.0, + "step": 13280 + }, + { + "epoch": 1.458488908412036, + "grad_norm": 2.109262704849243, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7048214673995972, + "num_tokens": 336109128.0, + "step": 13281 + }, + { + "epoch": 1.4585987261146496, + "grad_norm": 2.349367380142212, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7068496942520142, + "num_tokens": 336133381.0, + "step": 13282 + }, + { + "epoch": 1.4587085438172633, + "grad_norm": 2.4272429943084717, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7175803184509277, + "num_tokens": 336157396.0, + "step": 13283 + }, + { + "epoch": 1.458818361519877, + "grad_norm": 2.3921189308166504, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7292044758796692, + "num_tokens": 336181372.0, + "step": 13284 + }, + { + "epoch": 1.4589281792224906, + "grad_norm": 2.7593202590942383, + "learning_rate": 1e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7277075052261353, + "num_tokens": 336201775.0, + "step": 13285 + }, + { + "epoch": 1.4590379969251044, + "grad_norm": 2.5589518547058105, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7198879718780518, + "num_tokens": 336223307.0, + "step": 
13286 + }, + { + "epoch": 1.459147814627718, + "grad_norm": 2.3878321647644043, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7010475397109985, + "num_tokens": 336248429.0, + "step": 13287 + }, + { + "epoch": 1.4592576323303317, + "grad_norm": 2.2389652729034424, + "learning_rate": 1e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.719732403755188, + "num_tokens": 336273503.0, + "step": 13288 + }, + { + "epoch": 1.4593674500329454, + "grad_norm": 2.2280590534210205, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7235218286514282, + "num_tokens": 336300550.0, + "step": 13289 + }, + { + "epoch": 1.459477267735559, + "grad_norm": 2.268718957901001, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.7036728858947754, + "num_tokens": 336327057.0, + "step": 13290 + }, + { + "epoch": 1.4595870854381725, + "grad_norm": 2.0242857933044434, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.6964414119720459, + "num_tokens": 336356905.0, + "step": 13291 + }, + { + "epoch": 1.4596969031407863, + "grad_norm": 2.1739449501037598, + "learning_rate": 1e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.730867326259613, + "num_tokens": 336383368.0, + "step": 13292 + }, + { + "epoch": 1.4598067208434, + "grad_norm": 2.3442575931549072, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.722329318523407, + "num_tokens": 336405632.0, + "step": 13293 + }, + { + "epoch": 1.4599165385460136, + "grad_norm": 2.039926528930664, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7366907596588135, + "num_tokens": 336433050.0, + "step": 13294 + }, + { + "epoch": 1.4600263562486273, + "grad_norm": 2.5038869380950928, + "learning_rate": 1e-06, + "loss": 0.7941, + "mean_token_accuracy": 0.7503450512886047, + "num_tokens": 336451824.0, + "step": 13295 + }, + { + "epoch": 1.4601361739512408, + "grad_norm": 2.0382730960845947, + "learning_rate": 1e-06, + "loss": 1.0199, + 
"mean_token_accuracy": 0.6854724884033203, + "num_tokens": 336484058.0, + "step": 13296 + }, + { + "epoch": 1.4602459916538546, + "grad_norm": 2.4346516132354736, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7127822637557983, + "num_tokens": 336512229.0, + "step": 13297 + }, + { + "epoch": 1.4603558093564684, + "grad_norm": 2.2008450031280518, + "learning_rate": 1e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7316944599151611, + "num_tokens": 336541648.0, + "step": 13298 + }, + { + "epoch": 1.460465627059082, + "grad_norm": 1.8523263931274414, + "learning_rate": 1e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7291056513786316, + "num_tokens": 336577861.0, + "step": 13299 + }, + { + "epoch": 1.4605754447616957, + "grad_norm": 2.478830575942993, + "learning_rate": 1e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.7308916449546814, + "num_tokens": 336599187.0, + "step": 13300 + }, + { + "epoch": 1.4606852624643092, + "grad_norm": 1.9991495609283447, + "learning_rate": 1e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.7169222235679626, + "num_tokens": 336628205.0, + "step": 13301 + }, + { + "epoch": 1.460795080166923, + "grad_norm": 2.398158073425293, + "learning_rate": 1e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7233839631080627, + "num_tokens": 336652567.0, + "step": 13302 + }, + { + "epoch": 1.4609048978695367, + "grad_norm": 2.198795795440674, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.726127028465271, + "num_tokens": 336680269.0, + "step": 13303 + }, + { + "epoch": 1.4610147155721502, + "grad_norm": 2.463351249694824, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7298221588134766, + "num_tokens": 336701660.0, + "step": 13304 + }, + { + "epoch": 1.4611245332747638, + "grad_norm": 2.312041997909546, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7190423607826233, + "num_tokens": 336726675.0, + "step": 13305 + }, + { + "epoch": 
1.4612343509773775, + "grad_norm": 2.2480194568634033, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.693148672580719, + "num_tokens": 336756698.0, + "step": 13306 + }, + { + "epoch": 1.4613441686799913, + "grad_norm": 2.15364933013916, + "learning_rate": 1e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7213823199272156, + "num_tokens": 336783724.0, + "step": 13307 + }, + { + "epoch": 1.4614539863826048, + "grad_norm": 2.0904364585876465, + "learning_rate": 1e-06, + "loss": 0.8107, + "mean_token_accuracy": 0.7391916513442993, + "num_tokens": 336811366.0, + "step": 13308 + }, + { + "epoch": 1.4615638040852186, + "grad_norm": 2.5602221488952637, + "learning_rate": 1e-06, + "loss": 0.7798, + "mean_token_accuracy": 0.7469415664672852, + "num_tokens": 336830401.0, + "step": 13309 + }, + { + "epoch": 1.4616736217878321, + "grad_norm": 2.196800708770752, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.714921236038208, + "num_tokens": 336855172.0, + "step": 13310 + }, + { + "epoch": 1.4617834394904459, + "grad_norm": 2.081014394760132, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7024667263031006, + "num_tokens": 336885545.0, + "step": 13311 + }, + { + "epoch": 1.4618932571930596, + "grad_norm": 2.1630942821502686, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7166435122489929, + "num_tokens": 336913369.0, + "step": 13312 + }, + { + "epoch": 1.4620030748956732, + "grad_norm": 2.3277831077575684, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7250086665153503, + "num_tokens": 336937863.0, + "step": 13313 + }, + { + "epoch": 1.462112892598287, + "grad_norm": 2.4260683059692383, + "learning_rate": 1e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.6931219100952148, + "num_tokens": 336961563.0, + "step": 13314 + }, + { + "epoch": 1.4622227103009005, + "grad_norm": 2.276345729827881, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 
0.7182238698005676, + "num_tokens": 336986640.0, + "step": 13315 + }, + { + "epoch": 1.4623325280035142, + "grad_norm": 2.409245014190674, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7189797759056091, + "num_tokens": 337010262.0, + "step": 13316 + }, + { + "epoch": 1.462442345706128, + "grad_norm": 2.4064342975616455, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7185131311416626, + "num_tokens": 337034542.0, + "step": 13317 + }, + { + "epoch": 1.4625521634087415, + "grad_norm": 2.373645305633545, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7120476961135864, + "num_tokens": 337058971.0, + "step": 13318 + }, + { + "epoch": 1.462661981111355, + "grad_norm": 2.4177684783935547, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7150218486785889, + "num_tokens": 337084066.0, + "step": 13319 + }, + { + "epoch": 1.4627717988139688, + "grad_norm": 2.278186082839966, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7234402894973755, + "num_tokens": 337111910.0, + "step": 13320 + }, + { + "epoch": 1.4628816165165826, + "grad_norm": 2.387422800064087, + "learning_rate": 1e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7315396666526794, + "num_tokens": 337134673.0, + "step": 13321 + }, + { + "epoch": 1.462991434219196, + "grad_norm": 2.1447601318359375, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7157228589057922, + "num_tokens": 337164436.0, + "step": 13322 + }, + { + "epoch": 1.4631012519218098, + "grad_norm": 2.308685541152954, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7192783355712891, + "num_tokens": 337189057.0, + "step": 13323 + }, + { + "epoch": 1.4632110696244234, + "grad_norm": 2.517531394958496, + "learning_rate": 1e-06, + "loss": 0.8158, + "mean_token_accuracy": 0.745404839515686, + "num_tokens": 337212213.0, + "step": 13324 + }, + { + "epoch": 1.4633208873270371, + "grad_norm": 
2.4431676864624023, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7292375564575195, + "num_tokens": 337235701.0, + "step": 13325 + }, + { + "epoch": 1.463430705029651, + "grad_norm": 2.488858222961426, + "learning_rate": 1e-06, + "loss": 0.8653, + "mean_token_accuracy": 0.7317192554473877, + "num_tokens": 337256778.0, + "step": 13326 + }, + { + "epoch": 1.4635405227322644, + "grad_norm": 2.21832013130188, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7144973278045654, + "num_tokens": 337284273.0, + "step": 13327 + }, + { + "epoch": 1.4636503404348782, + "grad_norm": 2.180342197418213, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7079771757125854, + "num_tokens": 337313201.0, + "step": 13328 + }, + { + "epoch": 1.4637601581374917, + "grad_norm": 2.2543678283691406, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7150315046310425, + "num_tokens": 337337513.0, + "step": 13329 + }, + { + "epoch": 1.4638699758401055, + "grad_norm": 2.4335200786590576, + "learning_rate": 1e-06, + "loss": 0.8485, + "mean_token_accuracy": 0.7382614016532898, + "num_tokens": 337359585.0, + "step": 13330 + }, + { + "epoch": 1.463979793542719, + "grad_norm": 2.2979273796081543, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7165853977203369, + "num_tokens": 337386526.0, + "step": 13331 + }, + { + "epoch": 1.4640896112453328, + "grad_norm": 2.1697239875793457, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7159459590911865, + "num_tokens": 337413594.0, + "step": 13332 + }, + { + "epoch": 1.4641994289479463, + "grad_norm": 2.58121919631958, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7193050384521484, + "num_tokens": 337434893.0, + "step": 13333 + }, + { + "epoch": 1.46430924665056, + "grad_norm": 2.322068691253662, + "learning_rate": 1e-06, + "loss": 0.8, + "mean_token_accuracy": 0.7446640729904175, + "num_tokens": 
337459252.0, + "step": 13334 + }, + { + "epoch": 1.4644190643531738, + "grad_norm": 2.3613386154174805, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.701146125793457, + "num_tokens": 337483618.0, + "step": 13335 + }, + { + "epoch": 1.4645288820557874, + "grad_norm": 2.504746198654175, + "learning_rate": 1e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.7374148368835449, + "num_tokens": 337504711.0, + "step": 13336 + }, + { + "epoch": 1.464638699758401, + "grad_norm": 2.2173573970794678, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7276803851127625, + "num_tokens": 337530363.0, + "step": 13337 + }, + { + "epoch": 1.4647485174610146, + "grad_norm": 2.326045513153076, + "learning_rate": 1e-06, + "loss": 0.81, + "mean_token_accuracy": 0.7471622228622437, + "num_tokens": 337553834.0, + "step": 13338 + }, + { + "epoch": 1.4648583351636284, + "grad_norm": 2.202738046646118, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7258968353271484, + "num_tokens": 337581410.0, + "step": 13339 + }, + { + "epoch": 1.4649681528662422, + "grad_norm": 2.586714506149292, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7165812849998474, + "num_tokens": 337602224.0, + "step": 13340 + }, + { + "epoch": 1.4650779705688557, + "grad_norm": 2.603607177734375, + "learning_rate": 1e-06, + "loss": 0.8639, + "mean_token_accuracy": 0.72549968957901, + "num_tokens": 337623372.0, + "step": 13341 + }, + { + "epoch": 1.4651877882714692, + "grad_norm": 2.532757043838501, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7018411159515381, + "num_tokens": 337645848.0, + "step": 13342 + }, + { + "epoch": 1.465297605974083, + "grad_norm": 2.2399537563323975, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7116982936859131, + "num_tokens": 337672520.0, + "step": 13343 + }, + { + "epoch": 1.4654074236766967, + "grad_norm": 2.1970887184143066, + "learning_rate": 1e-06, + 
"loss": 0.9068, + "mean_token_accuracy": 0.7251650094985962, + "num_tokens": 337700019.0, + "step": 13344 + }, + { + "epoch": 1.4655172413793103, + "grad_norm": 2.0596258640289307, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.6947222948074341, + "num_tokens": 337730815.0, + "step": 13345 + }, + { + "epoch": 1.465627059081924, + "grad_norm": 2.137399911880493, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7235426902770996, + "num_tokens": 337758719.0, + "step": 13346 + }, + { + "epoch": 1.4657368767845376, + "grad_norm": 2.4966752529144287, + "learning_rate": 1e-06, + "loss": 0.7869, + "mean_token_accuracy": 0.7459713220596313, + "num_tokens": 337778348.0, + "step": 13347 + }, + { + "epoch": 1.4658466944871513, + "grad_norm": 2.093757152557373, + "learning_rate": 1e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7149798274040222, + "num_tokens": 337806938.0, + "step": 13348 + }, + { + "epoch": 1.465956512189765, + "grad_norm": 2.2445483207702637, + "learning_rate": 1e-06, + "loss": 0.8736, + "mean_token_accuracy": 0.7403589487075806, + "num_tokens": 337831207.0, + "step": 13349 + }, + { + "epoch": 1.4660663298923786, + "grad_norm": 2.199556589126587, + "learning_rate": 1e-06, + "loss": 0.7839, + "mean_token_accuracy": 0.7440360188484192, + "num_tokens": 337856472.0, + "step": 13350 + }, + { + "epoch": 1.4661761475949924, + "grad_norm": 1.9774994850158691, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7101657390594482, + "num_tokens": 337887026.0, + "step": 13351 + }, + { + "epoch": 1.466285965297606, + "grad_norm": 2.321373462677002, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7130335569381714, + "num_tokens": 337912408.0, + "step": 13352 + }, + { + "epoch": 1.4663957830002197, + "grad_norm": 2.489093542098999, + "learning_rate": 1e-06, + "loss": 0.8615, + "mean_token_accuracy": 0.7258701324462891, + "num_tokens": 337933537.0, + "step": 13353 + }, + { + "epoch": 
1.4665056007028334, + "grad_norm": 2.220083713531494, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7190083265304565, + "num_tokens": 337961413.0, + "step": 13354 + }, + { + "epoch": 1.466615418405447, + "grad_norm": 2.191100835800171, + "learning_rate": 1e-06, + "loss": 0.8614, + "mean_token_accuracy": 0.7339776754379272, + "num_tokens": 337990176.0, + "step": 13355 + }, + { + "epoch": 1.4667252361080605, + "grad_norm": 2.2521934509277344, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7213133573532104, + "num_tokens": 338016652.0, + "step": 13356 + }, + { + "epoch": 1.4668350538106742, + "grad_norm": 2.175097703933716, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.6968157291412354, + "num_tokens": 338047026.0, + "step": 13357 + }, + { + "epoch": 1.466944871513288, + "grad_norm": 2.198444128036499, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7110322713851929, + "num_tokens": 338073596.0, + "step": 13358 + }, + { + "epoch": 1.4670546892159015, + "grad_norm": 2.475741386413574, + "learning_rate": 1e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.7385127544403076, + "num_tokens": 338094749.0, + "step": 13359 + }, + { + "epoch": 1.4671645069185153, + "grad_norm": 2.319448471069336, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7075499892234802, + "num_tokens": 338119016.0, + "step": 13360 + }, + { + "epoch": 1.4672743246211288, + "grad_norm": 2.532280206680298, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7050966024398804, + "num_tokens": 338139909.0, + "step": 13361 + }, + { + "epoch": 1.4673841423237426, + "grad_norm": 2.3658676147460938, + "learning_rate": 1e-06, + "loss": 0.8415, + "mean_token_accuracy": 0.7383041977882385, + "num_tokens": 338161442.0, + "step": 13362 + }, + { + "epoch": 1.4674939600263563, + "grad_norm": 2.0527889728546143, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 
0.6932398080825806, + "num_tokens": 338189385.0, + "step": 13363 + }, + { + "epoch": 1.4676037777289699, + "grad_norm": 2.3003060817718506, + "learning_rate": 1e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7227656841278076, + "num_tokens": 338214068.0, + "step": 13364 + }, + { + "epoch": 1.4677135954315836, + "grad_norm": 1.9388412237167358, + "learning_rate": 1e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7275952100753784, + "num_tokens": 338245041.0, + "step": 13365 + }, + { + "epoch": 1.4678234131341972, + "grad_norm": 2.3329415321350098, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7257806062698364, + "num_tokens": 338269031.0, + "step": 13366 + }, + { + "epoch": 1.467933230836811, + "grad_norm": 2.48677659034729, + "learning_rate": 1e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.7413737773895264, + "num_tokens": 338290475.0, + "step": 13367 + }, + { + "epoch": 1.4680430485394247, + "grad_norm": 2.2039108276367188, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7100292444229126, + "num_tokens": 338316374.0, + "step": 13368 + }, + { + "epoch": 1.4681528662420382, + "grad_norm": 2.6126346588134766, + "learning_rate": 1e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7042739391326904, + "num_tokens": 338337334.0, + "step": 13369 + }, + { + "epoch": 1.4682626839446518, + "grad_norm": 2.224179267883301, + "learning_rate": 1e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7236879467964172, + "num_tokens": 338363727.0, + "step": 13370 + }, + { + "epoch": 1.4683725016472655, + "grad_norm": 2.276275873184204, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.734063982963562, + "num_tokens": 338389323.0, + "step": 13371 + }, + { + "epoch": 1.4684823193498793, + "grad_norm": 2.3962087631225586, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.725245475769043, + "num_tokens": 338411382.0, + "step": 13372 + }, + { + "epoch": 1.4685921370524928, + "grad_norm": 
2.123159408569336, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7057530283927917, + "num_tokens": 338440209.0, + "step": 13373 + }, + { + "epoch": 1.4687019547551066, + "grad_norm": 2.4774577617645264, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7164005041122437, + "num_tokens": 338461664.0, + "step": 13374 + }, + { + "epoch": 1.46881177245772, + "grad_norm": 2.370738983154297, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7183752059936523, + "num_tokens": 338485661.0, + "step": 13375 + }, + { + "epoch": 1.4689215901603339, + "grad_norm": 2.3719279766082764, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7053289413452148, + "num_tokens": 338512058.0, + "step": 13376 + }, + { + "epoch": 1.4690314078629476, + "grad_norm": 2.01789927482605, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7156275510787964, + "num_tokens": 338542925.0, + "step": 13377 + }, + { + "epoch": 1.4691412255655611, + "grad_norm": 2.3124823570251465, + "learning_rate": 1e-06, + "loss": 0.7542, + "mean_token_accuracy": 0.7533742189407349, + "num_tokens": 338565459.0, + "step": 13378 + }, + { + "epoch": 1.469251043268175, + "grad_norm": 2.195812702178955, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7196592092514038, + "num_tokens": 338591268.0, + "step": 13379 + }, + { + "epoch": 1.4693608609707884, + "grad_norm": 2.2197043895721436, + "learning_rate": 1e-06, + "loss": 0.8487, + "mean_token_accuracy": 0.7370638251304626, + "num_tokens": 338617080.0, + "step": 13380 + }, + { + "epoch": 1.4694706786734022, + "grad_norm": 2.3053054809570312, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7186950445175171, + "num_tokens": 338641548.0, + "step": 13381 + }, + { + "epoch": 1.4695804963760157, + "grad_norm": 2.3359246253967285, + "learning_rate": 1e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7274693846702576, + "num_tokens": 
338665579.0, + "step": 13382 + }, + { + "epoch": 1.4696903140786295, + "grad_norm": 2.065549373626709, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7163246273994446, + "num_tokens": 338696585.0, + "step": 13383 + }, + { + "epoch": 1.469800131781243, + "grad_norm": 1.92472243309021, + "learning_rate": 1e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7249345183372498, + "num_tokens": 338729700.0, + "step": 13384 + }, + { + "epoch": 1.4699099494838568, + "grad_norm": 2.1077709197998047, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7192429900169373, + "num_tokens": 338756244.0, + "step": 13385 + }, + { + "epoch": 1.4700197671864705, + "grad_norm": 2.042726755142212, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.6968688368797302, + "num_tokens": 338787990.0, + "step": 13386 + }, + { + "epoch": 1.470129584889084, + "grad_norm": 2.271408796310425, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.71773362159729, + "num_tokens": 338814413.0, + "step": 13387 + }, + { + "epoch": 1.4702394025916978, + "grad_norm": 2.2215795516967773, + "learning_rate": 1e-06, + "loss": 0.8557, + "mean_token_accuracy": 0.7330983281135559, + "num_tokens": 338838780.0, + "step": 13388 + }, + { + "epoch": 1.4703492202943114, + "grad_norm": 2.0386452674865723, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.7053883075714111, + "num_tokens": 338871841.0, + "step": 13389 + }, + { + "epoch": 1.4704590379969251, + "grad_norm": 2.3842837810516357, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7249339818954468, + "num_tokens": 338896630.0, + "step": 13390 + }, + { + "epoch": 1.4705688556995389, + "grad_norm": 2.4519600868225098, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7196590900421143, + "num_tokens": 338921233.0, + "step": 13391 + }, + { + "epoch": 1.4706786734021524, + "grad_norm": 2.138763427734375, + "learning_rate": 1e-06, 
+ "loss": 0.922, + "mean_token_accuracy": 0.7213618755340576, + "num_tokens": 338949681.0, + "step": 13392 + }, + { + "epoch": 1.4707884911047662, + "grad_norm": 2.3150694370269775, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7132718563079834, + "num_tokens": 338973384.0, + "step": 13393 + }, + { + "epoch": 1.4708983088073797, + "grad_norm": 2.044151544570923, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.6955511569976807, + "num_tokens": 339004421.0, + "step": 13394 + }, + { + "epoch": 1.4710081265099935, + "grad_norm": 2.743804454803467, + "learning_rate": 1e-06, + "loss": 0.8059, + "mean_token_accuracy": 0.7442724704742432, + "num_tokens": 339023104.0, + "step": 13395 + }, + { + "epoch": 1.471117944212607, + "grad_norm": 2.1843929290771484, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.713647186756134, + "num_tokens": 339051530.0, + "step": 13396 + }, + { + "epoch": 1.4712277619152208, + "grad_norm": 2.5442209243774414, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7213181853294373, + "num_tokens": 339072926.0, + "step": 13397 + }, + { + "epoch": 1.4713375796178343, + "grad_norm": 2.152466058731079, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7178248167037964, + "num_tokens": 339099540.0, + "step": 13398 + }, + { + "epoch": 1.471447397320448, + "grad_norm": 2.649689197540283, + "learning_rate": 1e-06, + "loss": 0.8714, + "mean_token_accuracy": 0.7224440574645996, + "num_tokens": 339117716.0, + "step": 13399 + }, + { + "epoch": 1.4715572150230618, + "grad_norm": 2.002413034439087, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7130178213119507, + "num_tokens": 339149218.0, + "step": 13400 + }, + { + "epoch": 1.4716670327256753, + "grad_norm": 2.517367362976074, + "learning_rate": 1e-06, + "loss": 0.8314, + "mean_token_accuracy": 0.7360544204711914, + "num_tokens": 339169664.0, + "step": 13401 + }, + { + "epoch": 
1.471776850428289, + "grad_norm": 2.1563544273376465, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7247146368026733, + "num_tokens": 339196468.0, + "step": 13402 + }, + { + "epoch": 1.4718866681309026, + "grad_norm": 2.4370310306549072, + "learning_rate": 1e-06, + "loss": 0.7459, + "mean_token_accuracy": 0.7670047879219055, + "num_tokens": 339217621.0, + "step": 13403 + }, + { + "epoch": 1.4719964858335164, + "grad_norm": 2.3973352909088135, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7081453800201416, + "num_tokens": 339242426.0, + "step": 13404 + }, + { + "epoch": 1.4721063035361301, + "grad_norm": 2.3342843055725098, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7096893191337585, + "num_tokens": 339266561.0, + "step": 13405 + }, + { + "epoch": 1.4722161212387437, + "grad_norm": 2.593855381011963, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7186710834503174, + "num_tokens": 339287871.0, + "step": 13406 + }, + { + "epoch": 1.4723259389413572, + "grad_norm": 2.400141477584839, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7253454923629761, + "num_tokens": 339311619.0, + "step": 13407 + }, + { + "epoch": 1.472435756643971, + "grad_norm": 2.1782939434051514, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.6972543001174927, + "num_tokens": 339342439.0, + "step": 13408 + }, + { + "epoch": 1.4725455743465847, + "grad_norm": 2.151744842529297, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7295575737953186, + "num_tokens": 339369753.0, + "step": 13409 + }, + { + "epoch": 1.4726553920491983, + "grad_norm": 2.014292001724243, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7134461998939514, + "num_tokens": 339400403.0, + "step": 13410 + }, + { + "epoch": 1.472765209751812, + "grad_norm": 2.3243675231933594, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 
0.7217648029327393, + "num_tokens": 339422994.0, + "step": 13411 + }, + { + "epoch": 1.4728750274544256, + "grad_norm": 1.8620551824569702, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7126286029815674, + "num_tokens": 339455960.0, + "step": 13412 + }, + { + "epoch": 1.4729848451570393, + "grad_norm": 2.1176509857177734, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7278122901916504, + "num_tokens": 339483903.0, + "step": 13413 + }, + { + "epoch": 1.473094662859653, + "grad_norm": 2.036501407623291, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7199994325637817, + "num_tokens": 339511721.0, + "step": 13414 + }, + { + "epoch": 1.4732044805622666, + "grad_norm": 2.0755035877227783, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7128698825836182, + "num_tokens": 339542255.0, + "step": 13415 + }, + { + "epoch": 1.4733142982648804, + "grad_norm": 2.254070281982422, + "learning_rate": 1e-06, + "loss": 0.8143, + "mean_token_accuracy": 0.7450203895568848, + "num_tokens": 339567282.0, + "step": 13416 + }, + { + "epoch": 1.473424115967494, + "grad_norm": 2.4743077754974365, + "learning_rate": 1e-06, + "loss": 0.8099, + "mean_token_accuracy": 0.7431477904319763, + "num_tokens": 339588120.0, + "step": 13417 + }, + { + "epoch": 1.4735339336701077, + "grad_norm": 2.1072328090667725, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7174593806266785, + "num_tokens": 339615005.0, + "step": 13418 + }, + { + "epoch": 1.4736437513727214, + "grad_norm": 1.9731738567352295, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7128952741622925, + "num_tokens": 339646597.0, + "step": 13419 + }, + { + "epoch": 1.473753569075335, + "grad_norm": 2.109769821166992, + "learning_rate": 1e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7166566848754883, + "num_tokens": 339672915.0, + "step": 13420 + }, + { + "epoch": 1.4738633867779485, + "grad_norm": 
2.3435251712799072, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.715660810470581, + "num_tokens": 339696783.0, + "step": 13421 + }, + { + "epoch": 1.4739732044805622, + "grad_norm": 2.080190420150757, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7076531052589417, + "num_tokens": 339726940.0, + "step": 13422 + }, + { + "epoch": 1.474083022183176, + "grad_norm": 2.768099546432495, + "learning_rate": 1e-06, + "loss": 0.8102, + "mean_token_accuracy": 0.7420356273651123, + "num_tokens": 339743146.0, + "step": 13423 + }, + { + "epoch": 1.4741928398857895, + "grad_norm": 2.735978841781616, + "learning_rate": 1e-06, + "loss": 0.817, + "mean_token_accuracy": 0.7388548851013184, + "num_tokens": 339763143.0, + "step": 13424 + }, + { + "epoch": 1.4743026575884033, + "grad_norm": 1.9911530017852783, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.6813532114028931, + "num_tokens": 339796036.0, + "step": 13425 + }, + { + "epoch": 1.4744124752910168, + "grad_norm": 2.288526773452759, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.7022049427032471, + "num_tokens": 339822520.0, + "step": 13426 + }, + { + "epoch": 1.4745222929936306, + "grad_norm": 2.489971399307251, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7124426364898682, + "num_tokens": 339845027.0, + "step": 13427 + }, + { + "epoch": 1.4746321106962443, + "grad_norm": 2.192819833755493, + "learning_rate": 1e-06, + "loss": 0.8832, + "mean_token_accuracy": 0.721265435218811, + "num_tokens": 339872003.0, + "step": 13428 + }, + { + "epoch": 1.4747419283988579, + "grad_norm": 2.0854763984680176, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.722724437713623, + "num_tokens": 339900223.0, + "step": 13429 + }, + { + "epoch": 1.4748517461014716, + "grad_norm": 2.2878105640411377, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7058327198028564, + "num_tokens": 
339926812.0, + "step": 13430 + }, + { + "epoch": 1.4749615638040852, + "grad_norm": 2.2884573936462402, + "learning_rate": 1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.722651481628418, + "num_tokens": 339951504.0, + "step": 13431 + }, + { + "epoch": 1.475071381506699, + "grad_norm": 2.539323329925537, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.721314549446106, + "num_tokens": 339971572.0, + "step": 13432 + }, + { + "epoch": 1.4751811992093127, + "grad_norm": 2.267946481704712, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7129874229431152, + "num_tokens": 339996334.0, + "step": 13433 + }, + { + "epoch": 1.4752910169119262, + "grad_norm": 1.9973294734954834, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.7003269195556641, + "num_tokens": 340027054.0, + "step": 13434 + }, + { + "epoch": 1.4754008346145397, + "grad_norm": 2.4752323627471924, + "learning_rate": 1e-06, + "loss": 0.7922, + "mean_token_accuracy": 0.747627317905426, + "num_tokens": 340048008.0, + "step": 13435 + }, + { + "epoch": 1.4755106523171535, + "grad_norm": 2.4649498462677, + "learning_rate": 1e-06, + "loss": 0.8417, + "mean_token_accuracy": 0.7340877056121826, + "num_tokens": 340069441.0, + "step": 13436 + }, + { + "epoch": 1.4756204700197673, + "grad_norm": 2.2100610733032227, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7171680927276611, + "num_tokens": 340095878.0, + "step": 13437 + }, + { + "epoch": 1.4757302877223808, + "grad_norm": 2.384319543838501, + "learning_rate": 1e-06, + "loss": 0.7941, + "mean_token_accuracy": 0.7438780069351196, + "num_tokens": 340117242.0, + "step": 13438 + }, + { + "epoch": 1.4758401054249946, + "grad_norm": 2.092736005783081, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7037810683250427, + "num_tokens": 340145706.0, + "step": 13439 + }, + { + "epoch": 1.475949923127608, + "grad_norm": 2.04266619682312, + "learning_rate": 1e-06, + 
"loss": 0.9294, + "mean_token_accuracy": 0.7249089479446411, + "num_tokens": 340177610.0, + "step": 13440 + }, + { + "epoch": 1.4760597408302218, + "grad_norm": 2.501389503479004, + "learning_rate": 1e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.7224723100662231, + "num_tokens": 340199366.0, + "step": 13441 + }, + { + "epoch": 1.4761695585328356, + "grad_norm": 2.0879194736480713, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6921321153640747, + "num_tokens": 340230320.0, + "step": 13442 + }, + { + "epoch": 1.4762793762354491, + "grad_norm": 2.4632625579833984, + "learning_rate": 1e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7289828062057495, + "num_tokens": 340252707.0, + "step": 13443 + }, + { + "epoch": 1.476389193938063, + "grad_norm": 2.0579960346221924, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7299963235855103, + "num_tokens": 340281942.0, + "step": 13444 + }, + { + "epoch": 1.4764990116406764, + "grad_norm": 2.4517645835876465, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7075658440589905, + "num_tokens": 340306195.0, + "step": 13445 + }, + { + "epoch": 1.4766088293432902, + "grad_norm": 2.3253586292266846, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7169164419174194, + "num_tokens": 340332740.0, + "step": 13446 + }, + { + "epoch": 1.4767186470459037, + "grad_norm": 2.551539182662964, + "learning_rate": 1e-06, + "loss": 0.8444, + "mean_token_accuracy": 0.7309274673461914, + "num_tokens": 340352852.0, + "step": 13447 + }, + { + "epoch": 1.4768284647485175, + "grad_norm": 2.267136812210083, + "learning_rate": 1e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7217510342597961, + "num_tokens": 340378564.0, + "step": 13448 + }, + { + "epoch": 1.476938282451131, + "grad_norm": 2.204188585281372, + "learning_rate": 1e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.7268522381782532, + "num_tokens": 340405184.0, + "step": 13449 + }, + { + 
"epoch": 1.4770481001537448, + "grad_norm": 2.775481939315796, + "learning_rate": 1e-06, + "loss": 0.7706, + "mean_token_accuracy": 0.746575117111206, + "num_tokens": 340422600.0, + "step": 13450 + }, + { + "epoch": 1.4771579178563585, + "grad_norm": 2.4171745777130127, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7185590267181396, + "num_tokens": 340446664.0, + "step": 13451 + }, + { + "epoch": 1.477267735558972, + "grad_norm": 2.4848337173461914, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7201191186904907, + "num_tokens": 340468232.0, + "step": 13452 + }, + { + "epoch": 1.4773775532615858, + "grad_norm": 2.499638795852661, + "learning_rate": 1e-06, + "loss": 0.8691, + "mean_token_accuracy": 0.7278683185577393, + "num_tokens": 340488846.0, + "step": 13453 + }, + { + "epoch": 1.4774873709641994, + "grad_norm": 2.092698574066162, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7035731077194214, + "num_tokens": 340517412.0, + "step": 13454 + }, + { + "epoch": 1.477597188666813, + "grad_norm": 2.645648956298828, + "learning_rate": 1e-06, + "loss": 0.8704, + "mean_token_accuracy": 0.7252227067947388, + "num_tokens": 340536862.0, + "step": 13455 + }, + { + "epoch": 1.4777070063694269, + "grad_norm": 2.6041016578674316, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7121891975402832, + "num_tokens": 340557460.0, + "step": 13456 + }, + { + "epoch": 1.4778168240720404, + "grad_norm": 1.9642637968063354, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7113299369812012, + "num_tokens": 340588103.0, + "step": 13457 + }, + { + "epoch": 1.477926641774654, + "grad_norm": 2.1578972339630127, + "learning_rate": 1e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.7295355796813965, + "num_tokens": 340614231.0, + "step": 13458 + }, + { + "epoch": 1.4780364594772677, + "grad_norm": 2.247330665588379, + "learning_rate": 1e-06, + "loss": 0.8992, + 
"mean_token_accuracy": 0.7215136289596558, + "num_tokens": 340638117.0, + "step": 13459 + }, + { + "epoch": 1.4781462771798815, + "grad_norm": 1.8422333002090454, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7048435211181641, + "num_tokens": 340674226.0, + "step": 13460 + }, + { + "epoch": 1.478256094882495, + "grad_norm": 2.1723101139068604, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7013670206069946, + "num_tokens": 340700992.0, + "step": 13461 + }, + { + "epoch": 1.4783659125851087, + "grad_norm": 2.221034288406372, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7040761709213257, + "num_tokens": 340727349.0, + "step": 13462 + }, + { + "epoch": 1.4784757302877223, + "grad_norm": 2.08524227142334, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7227123975753784, + "num_tokens": 340755829.0, + "step": 13463 + }, + { + "epoch": 1.478585547990336, + "grad_norm": 2.3396501541137695, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7058560252189636, + "num_tokens": 340780510.0, + "step": 13464 + }, + { + "epoch": 1.4786953656929498, + "grad_norm": 2.5024476051330566, + "learning_rate": 1e-06, + "loss": 0.8316, + "mean_token_accuracy": 0.7352728843688965, + "num_tokens": 340801441.0, + "step": 13465 + }, + { + "epoch": 1.4788051833955633, + "grad_norm": 2.1441259384155273, + "learning_rate": 1e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.6922042965888977, + "num_tokens": 340829458.0, + "step": 13466 + }, + { + "epoch": 1.478915001098177, + "grad_norm": 2.1460394859313965, + "learning_rate": 1e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7381484508514404, + "num_tokens": 340855850.0, + "step": 13467 + }, + { + "epoch": 1.4790248188007906, + "grad_norm": 2.2997570037841797, + "learning_rate": 1e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7217445373535156, + "num_tokens": 340880839.0, + "step": 13468 + }, + { + "epoch": 
1.4791346365034044, + "grad_norm": 2.192833423614502, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7063122987747192, + "num_tokens": 340908198.0, + "step": 13469 + }, + { + "epoch": 1.4792444542060181, + "grad_norm": 2.0899338722229004, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7067490816116333, + "num_tokens": 340940422.0, + "step": 13470 + }, + { + "epoch": 1.4793542719086317, + "grad_norm": 2.1025092601776123, + "learning_rate": 1e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7293475866317749, + "num_tokens": 340967220.0, + "step": 13471 + }, + { + "epoch": 1.4794640896112452, + "grad_norm": 2.432499408721924, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7058104276657104, + "num_tokens": 340993247.0, + "step": 13472 + }, + { + "epoch": 1.479573907313859, + "grad_norm": 2.444624185562134, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.737363338470459, + "num_tokens": 341016650.0, + "step": 13473 + }, + { + "epoch": 1.4796837250164727, + "grad_norm": 2.5002329349517822, + "learning_rate": 1e-06, + "loss": 0.8145, + "mean_token_accuracy": 0.740193784236908, + "num_tokens": 341036870.0, + "step": 13474 + }, + { + "epoch": 1.4797935427190863, + "grad_norm": 2.072331666946411, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7225706577301025, + "num_tokens": 341068779.0, + "step": 13475 + }, + { + "epoch": 1.4799033604217, + "grad_norm": 2.1547696590423584, + "learning_rate": 1e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7338601350784302, + "num_tokens": 341094305.0, + "step": 13476 + }, + { + "epoch": 1.4800131781243135, + "grad_norm": 2.719024181365967, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7151775360107422, + "num_tokens": 341114479.0, + "step": 13477 + }, + { + "epoch": 1.4801229958269273, + "grad_norm": 2.221064329147339, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 
0.7207709550857544, + "num_tokens": 341143329.0, + "step": 13478 + }, + { + "epoch": 1.480232813529541, + "grad_norm": 2.22080659866333, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.724533200263977, + "num_tokens": 341168669.0, + "step": 13479 + }, + { + "epoch": 1.4803426312321546, + "grad_norm": 2.5622568130493164, + "learning_rate": 1e-06, + "loss": 0.8306, + "mean_token_accuracy": 0.7388562560081482, + "num_tokens": 341188528.0, + "step": 13480 + }, + { + "epoch": 1.4804524489347684, + "grad_norm": 2.6585254669189453, + "learning_rate": 1e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.731123685836792, + "num_tokens": 341208395.0, + "step": 13481 + }, + { + "epoch": 1.4805622666373819, + "grad_norm": 2.149144172668457, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.693753182888031, + "num_tokens": 341238865.0, + "step": 13482 + }, + { + "epoch": 1.4806720843399956, + "grad_norm": 2.5201098918914795, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7232168316841125, + "num_tokens": 341259894.0, + "step": 13483 + }, + { + "epoch": 1.4807819020426094, + "grad_norm": 2.1341023445129395, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7219882607460022, + "num_tokens": 341289745.0, + "step": 13484 + }, + { + "epoch": 1.480891719745223, + "grad_norm": 2.2982676029205322, + "learning_rate": 1e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7401046752929688, + "num_tokens": 341314231.0, + "step": 13485 + }, + { + "epoch": 1.4810015374478365, + "grad_norm": 2.4671316146850586, + "learning_rate": 1e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.7302994728088379, + "num_tokens": 341336484.0, + "step": 13486 + }, + { + "epoch": 1.4811113551504502, + "grad_norm": 2.2140908241271973, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7128363847732544, + "num_tokens": 341362943.0, + "step": 13487 + }, + { + "epoch": 1.481221172853064, + "grad_norm": 
2.3632123470306396, + "learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7302747964859009, + "num_tokens": 341385861.0, + "step": 13488 + }, + { + "epoch": 1.4813309905556775, + "grad_norm": 2.2259926795959473, + "learning_rate": 1e-06, + "loss": 0.867, + "mean_token_accuracy": 0.7273434400558472, + "num_tokens": 341409571.0, + "step": 13489 + }, + { + "epoch": 1.4814408082582913, + "grad_norm": 2.2380282878875732, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7186200022697449, + "num_tokens": 341433320.0, + "step": 13490 + }, + { + "epoch": 1.4815506259609048, + "grad_norm": 2.4035484790802, + "learning_rate": 1e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.7258392572402954, + "num_tokens": 341456692.0, + "step": 13491 + }, + { + "epoch": 1.4816604436635186, + "grad_norm": 2.432171583175659, + "learning_rate": 1e-06, + "loss": 0.838, + "mean_token_accuracy": 0.7277774214744568, + "num_tokens": 341478380.0, + "step": 13492 + }, + { + "epoch": 1.4817702613661323, + "grad_norm": 2.2014083862304688, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7170069217681885, + "num_tokens": 341504326.0, + "step": 13493 + }, + { + "epoch": 1.4818800790687459, + "grad_norm": 2.76413893699646, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7333285808563232, + "num_tokens": 341523092.0, + "step": 13494 + }, + { + "epoch": 1.4819898967713596, + "grad_norm": 2.4047746658325195, + "learning_rate": 1e-06, + "loss": 0.739, + "mean_token_accuracy": 0.7570653557777405, + "num_tokens": 341542499.0, + "step": 13495 + }, + { + "epoch": 1.4820997144739732, + "grad_norm": 2.1378958225250244, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7169428467750549, + "num_tokens": 341569450.0, + "step": 13496 + }, + { + "epoch": 1.482209532176587, + "grad_norm": 2.173677921295166, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7284079790115356, + "num_tokens": 
341596370.0, + "step": 13497 + }, + { + "epoch": 1.4823193498792007, + "grad_norm": 2.336792230606079, + "learning_rate": 1e-06, + "loss": 0.8227, + "mean_token_accuracy": 0.7384775876998901, + "num_tokens": 341619183.0, + "step": 13498 + }, + { + "epoch": 1.4824291675818142, + "grad_norm": 2.400446653366089, + "learning_rate": 1e-06, + "loss": 0.8325, + "mean_token_accuracy": 0.7373498678207397, + "num_tokens": 341642314.0, + "step": 13499 + }, + { + "epoch": 1.4825389852844277, + "grad_norm": 2.172907590866089, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7080380916595459, + "num_tokens": 341669948.0, + "step": 13500 + }, + { + "epoch": 1.4826488029870415, + "grad_norm": 2.345337152481079, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7199360132217407, + "num_tokens": 341693919.0, + "step": 13501 + }, + { + "epoch": 1.4827586206896552, + "grad_norm": 2.158724069595337, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7240685820579529, + "num_tokens": 341721644.0, + "step": 13502 + }, + { + "epoch": 1.4828684383922688, + "grad_norm": 2.416411876678467, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7234878540039062, + "num_tokens": 341745365.0, + "step": 13503 + }, + { + "epoch": 1.4829782560948825, + "grad_norm": 2.1495020389556885, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7088595628738403, + "num_tokens": 341774512.0, + "step": 13504 + }, + { + "epoch": 1.483088073797496, + "grad_norm": 2.055070638656616, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7174414396286011, + "num_tokens": 341804288.0, + "step": 13505 + }, + { + "epoch": 1.4831978915001098, + "grad_norm": 2.051056385040283, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7101192474365234, + "num_tokens": 341833431.0, + "step": 13506 + }, + { + "epoch": 1.4833077092027236, + "grad_norm": 2.1239893436431885, + "learning_rate": 
1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.6970109939575195, + "num_tokens": 341863010.0, + "step": 13507 + }, + { + "epoch": 1.4834175269053371, + "grad_norm": 2.3613340854644775, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7108275890350342, + "num_tokens": 341885552.0, + "step": 13508 + }, + { + "epoch": 1.4835273446079509, + "grad_norm": 2.050114154815674, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.7116671204566956, + "num_tokens": 341916067.0, + "step": 13509 + }, + { + "epoch": 1.4836371623105644, + "grad_norm": 2.2389705181121826, + "learning_rate": 1e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.734775960445404, + "num_tokens": 341942216.0, + "step": 13510 + }, + { + "epoch": 1.4837469800131782, + "grad_norm": 2.0146756172180176, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7195766568183899, + "num_tokens": 341972255.0, + "step": 13511 + }, + { + "epoch": 1.4838567977157917, + "grad_norm": 2.869786500930786, + "learning_rate": 1e-06, + "loss": 0.8566, + "mean_token_accuracy": 0.7306702136993408, + "num_tokens": 341990780.0, + "step": 13512 + }, + { + "epoch": 1.4839666154184055, + "grad_norm": 2.457943916320801, + "learning_rate": 1e-06, + "loss": 0.867, + "mean_token_accuracy": 0.7322396039962769, + "num_tokens": 342013177.0, + "step": 13513 + }, + { + "epoch": 1.484076433121019, + "grad_norm": 2.2369861602783203, + "learning_rate": 1e-06, + "loss": 0.869, + "mean_token_accuracy": 0.7314959764480591, + "num_tokens": 342039591.0, + "step": 13514 + }, + { + "epoch": 1.4841862508236328, + "grad_norm": 2.2680933475494385, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7176448106765747, + "num_tokens": 342065040.0, + "step": 13515 + }, + { + "epoch": 1.4842960685262465, + "grad_norm": 2.605302572250366, + "learning_rate": 1e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7326734662055969, + "num_tokens": 342086431.0, + "step": 13516 + }, + { + 
"epoch": 1.48440588622886, + "grad_norm": 2.545614719390869, + "learning_rate": 1e-06, + "loss": 0.8384, + "mean_token_accuracy": 0.7395911812782288, + "num_tokens": 342107784.0, + "step": 13517 + }, + { + "epoch": 1.4845157039314738, + "grad_norm": 2.383262872695923, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7370411157608032, + "num_tokens": 342130583.0, + "step": 13518 + }, + { + "epoch": 1.4846255216340873, + "grad_norm": 2.6144537925720215, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7234708070755005, + "num_tokens": 342154811.0, + "step": 13519 + }, + { + "epoch": 1.484735339336701, + "grad_norm": 2.433774709701538, + "learning_rate": 1e-06, + "loss": 0.8415, + "mean_token_accuracy": 0.7358975410461426, + "num_tokens": 342177558.0, + "step": 13520 + }, + { + "epoch": 1.4848451570393149, + "grad_norm": 2.453587293624878, + "learning_rate": 1e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.7298052310943604, + "num_tokens": 342199421.0, + "step": 13521 + }, + { + "epoch": 1.4849549747419284, + "grad_norm": 2.181169033050537, + "learning_rate": 1e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7240592837333679, + "num_tokens": 342226006.0, + "step": 13522 + }, + { + "epoch": 1.485064792444542, + "grad_norm": 2.1130030155181885, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7139961123466492, + "num_tokens": 342253917.0, + "step": 13523 + }, + { + "epoch": 1.4851746101471557, + "grad_norm": 2.099520444869995, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7135772705078125, + "num_tokens": 342280598.0, + "step": 13524 + }, + { + "epoch": 1.4852844278497694, + "grad_norm": 2.3001537322998047, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7010402679443359, + "num_tokens": 342308265.0, + "step": 13525 + }, + { + "epoch": 1.485394245552383, + "grad_norm": 2.3444650173187256, + "learning_rate": 1e-06, + "loss": 0.9335, + 
"mean_token_accuracy": 0.710100531578064, + "num_tokens": 342331713.0, + "step": 13526 + }, + { + "epoch": 1.4855040632549967, + "grad_norm": 2.3310489654541016, + "learning_rate": 1e-06, + "loss": 0.8558, + "mean_token_accuracy": 0.7297073602676392, + "num_tokens": 342354766.0, + "step": 13527 + }, + { + "epoch": 1.4856138809576103, + "grad_norm": 2.2961254119873047, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7047391533851624, + "num_tokens": 342380704.0, + "step": 13528 + }, + { + "epoch": 1.485723698660224, + "grad_norm": 2.2942392826080322, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7139202356338501, + "num_tokens": 342406910.0, + "step": 13529 + }, + { + "epoch": 1.4858335163628378, + "grad_norm": 2.336772918701172, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7064881920814514, + "num_tokens": 342431939.0, + "step": 13530 + }, + { + "epoch": 1.4859433340654513, + "grad_norm": 2.3590173721313477, + "learning_rate": 1e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7207505106925964, + "num_tokens": 342455637.0, + "step": 13531 + }, + { + "epoch": 1.486053151768065, + "grad_norm": 2.4426634311676025, + "learning_rate": 1e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.7235100865364075, + "num_tokens": 342477000.0, + "step": 13532 + }, + { + "epoch": 1.4861629694706786, + "grad_norm": 2.426389455795288, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7107546329498291, + "num_tokens": 342499433.0, + "step": 13533 + }, + { + "epoch": 1.4862727871732924, + "grad_norm": 2.1563684940338135, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7187756299972534, + "num_tokens": 342529144.0, + "step": 13534 + }, + { + "epoch": 1.4863826048759061, + "grad_norm": 2.342026948928833, + "learning_rate": 1e-06, + "loss": 0.8311, + "mean_token_accuracy": 0.737909197807312, + "num_tokens": 342553623.0, + "step": 13535 + }, + { + "epoch": 1.4864924225785197, 
+ "grad_norm": 2.224017381668091, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7044267654418945, + "num_tokens": 342582386.0, + "step": 13536 + }, + { + "epoch": 1.4866022402811332, + "grad_norm": 2.240408182144165, + "learning_rate": 1e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7335952520370483, + "num_tokens": 342605208.0, + "step": 13537 + }, + { + "epoch": 1.486712057983747, + "grad_norm": 2.2304232120513916, + "learning_rate": 1e-06, + "loss": 0.7774, + "mean_token_accuracy": 0.7610750198364258, + "num_tokens": 342630862.0, + "step": 13538 + }, + { + "epoch": 1.4868218756863607, + "grad_norm": 2.1780214309692383, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7221741676330566, + "num_tokens": 342660154.0, + "step": 13539 + }, + { + "epoch": 1.4869316933889742, + "grad_norm": 2.0772855281829834, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7050976753234863, + "num_tokens": 342691579.0, + "step": 13540 + }, + { + "epoch": 1.487041511091588, + "grad_norm": 2.1096808910369873, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.6943528652191162, + "num_tokens": 342720925.0, + "step": 13541 + }, + { + "epoch": 1.4871513287942015, + "grad_norm": 2.2450625896453857, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7164174318313599, + "num_tokens": 342744921.0, + "step": 13542 + }, + { + "epoch": 1.4872611464968153, + "grad_norm": 1.9626175165176392, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7157995104789734, + "num_tokens": 342777667.0, + "step": 13543 + }, + { + "epoch": 1.487370964199429, + "grad_norm": 2.5048727989196777, + "learning_rate": 1e-06, + "loss": 0.7983, + "mean_token_accuracy": 0.751234769821167, + "num_tokens": 342798149.0, + "step": 13544 + }, + { + "epoch": 1.4874807819020426, + "grad_norm": 2.435089349746704, + "learning_rate": 1e-06, + "loss": 0.7529, + "mean_token_accuracy": 0.7548434138298035, + 
"num_tokens": 342819816.0, + "step": 13545 + }, + { + "epoch": 1.4875905996046563, + "grad_norm": 2.0509731769561768, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.714078426361084, + "num_tokens": 342849257.0, + "step": 13546 + }, + { + "epoch": 1.4877004173072699, + "grad_norm": 2.2364723682403564, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7142061591148376, + "num_tokens": 342876535.0, + "step": 13547 + }, + { + "epoch": 1.4878102350098836, + "grad_norm": 2.4987804889678955, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7174086570739746, + "num_tokens": 342897588.0, + "step": 13548 + }, + { + "epoch": 1.4879200527124974, + "grad_norm": 2.325448751449585, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7102993130683899, + "num_tokens": 342922999.0, + "step": 13549 + }, + { + "epoch": 1.488029870415111, + "grad_norm": 2.4430794715881348, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7250494956970215, + "num_tokens": 342945301.0, + "step": 13550 + }, + { + "epoch": 1.4881396881177245, + "grad_norm": 2.306053876876831, + "learning_rate": 1e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.725055456161499, + "num_tokens": 342968822.0, + "step": 13551 + }, + { + "epoch": 1.4882495058203382, + "grad_norm": 2.2215378284454346, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.717634916305542, + "num_tokens": 342992904.0, + "step": 13552 + }, + { + "epoch": 1.488359323522952, + "grad_norm": 2.268578052520752, + "learning_rate": 1e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.736445426940918, + "num_tokens": 343018494.0, + "step": 13553 + }, + { + "epoch": 1.4884691412255655, + "grad_norm": 2.4712703227996826, + "learning_rate": 1e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.725214958190918, + "num_tokens": 343040091.0, + "step": 13554 + }, + { + "epoch": 1.4885789589281793, + "grad_norm": 2.4198343753814697, + 
"learning_rate": 1e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.722834587097168, + "num_tokens": 343064849.0, + "step": 13555 + }, + { + "epoch": 1.4886887766307928, + "grad_norm": 2.339292526245117, + "learning_rate": 1e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.7200270891189575, + "num_tokens": 343089766.0, + "step": 13556 + }, + { + "epoch": 1.4887985943334066, + "grad_norm": 1.9987313747406006, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7103807926177979, + "num_tokens": 343121847.0, + "step": 13557 + }, + { + "epoch": 1.4889084120360203, + "grad_norm": 2.2995779514312744, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7152614593505859, + "num_tokens": 343145811.0, + "step": 13558 + }, + { + "epoch": 1.4890182297386338, + "grad_norm": 2.3533926010131836, + "learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7233214378356934, + "num_tokens": 343169881.0, + "step": 13559 + }, + { + "epoch": 1.4891280474412476, + "grad_norm": 2.3822383880615234, + "learning_rate": 1e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.7177595496177673, + "num_tokens": 343193257.0, + "step": 13560 + }, + { + "epoch": 1.4892378651438611, + "grad_norm": 2.271116018295288, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.6909720301628113, + "num_tokens": 343220198.0, + "step": 13561 + }, + { + "epoch": 1.489347682846475, + "grad_norm": 2.171912670135498, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7071256041526794, + "num_tokens": 343246859.0, + "step": 13562 + }, + { + "epoch": 1.4894575005490884, + "grad_norm": 2.4578616619110107, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.7051253914833069, + "num_tokens": 343271084.0, + "step": 13563 + }, + { + "epoch": 1.4895673182517022, + "grad_norm": 2.562509775161743, + "learning_rate": 1e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.7202976942062378, + "num_tokens": 343293539.0, + "step": 
13564 + }, + { + "epoch": 1.4896771359543157, + "grad_norm": 2.1508731842041016, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.6972389817237854, + "num_tokens": 343321841.0, + "step": 13565 + }, + { + "epoch": 1.4897869536569295, + "grad_norm": 2.498164653778076, + "learning_rate": 1e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7282219529151917, + "num_tokens": 343343506.0, + "step": 13566 + }, + { + "epoch": 1.4898967713595432, + "grad_norm": 2.2576687335968018, + "learning_rate": 1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7243117094039917, + "num_tokens": 343368402.0, + "step": 13567 + }, + { + "epoch": 1.4900065890621568, + "grad_norm": 2.0381062030792236, + "learning_rate": 1e-06, + "loss": 0.847, + "mean_token_accuracy": 0.7301470041275024, + "num_tokens": 343399816.0, + "step": 13568 + }, + { + "epoch": 1.4901164067647705, + "grad_norm": 2.047675848007202, + "learning_rate": 1e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7194361686706543, + "num_tokens": 343430142.0, + "step": 13569 + }, + { + "epoch": 1.490226224467384, + "grad_norm": 2.279693126678467, + "learning_rate": 1e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7207046747207642, + "num_tokens": 343455930.0, + "step": 13570 + }, + { + "epoch": 1.4903360421699978, + "grad_norm": 2.234884262084961, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7210765480995178, + "num_tokens": 343481988.0, + "step": 13571 + }, + { + "epoch": 1.4904458598726116, + "grad_norm": 2.6626908779144287, + "learning_rate": 1e-06, + "loss": 0.7544, + "mean_token_accuracy": 0.7542787790298462, + "num_tokens": 343500880.0, + "step": 13572 + }, + { + "epoch": 1.4905556775752251, + "grad_norm": 1.9438515901565552, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7221987247467041, + "num_tokens": 343532192.0, + "step": 13573 + }, + { + "epoch": 1.4906654952778389, + "grad_norm": 2.36812424659729, + "learning_rate": 1e-06, + "loss": 0.9202, + 
"mean_token_accuracy": 0.713367223739624, + "num_tokens": 343557350.0, + "step": 13574 + }, + { + "epoch": 1.4907753129804524, + "grad_norm": 2.641354560852051, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7103385329246521, + "num_tokens": 343576833.0, + "step": 13575 + }, + { + "epoch": 1.4908851306830662, + "grad_norm": 2.246410369873047, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.714317798614502, + "num_tokens": 343601533.0, + "step": 13576 + }, + { + "epoch": 1.4909949483856797, + "grad_norm": 2.1675169467926025, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7046728134155273, + "num_tokens": 343629649.0, + "step": 13577 + }, + { + "epoch": 1.4911047660882935, + "grad_norm": 2.0453522205352783, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7090947031974792, + "num_tokens": 343662154.0, + "step": 13578 + }, + { + "epoch": 1.491214583790907, + "grad_norm": 2.0501043796539307, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7212267518043518, + "num_tokens": 343691782.0, + "step": 13579 + }, + { + "epoch": 1.4913244014935207, + "grad_norm": 2.3355228900909424, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.6994415521621704, + "num_tokens": 343715342.0, + "step": 13580 + }, + { + "epoch": 1.4914342191961345, + "grad_norm": 2.57863712310791, + "learning_rate": 1e-06, + "loss": 0.7738, + "mean_token_accuracy": 0.7504251003265381, + "num_tokens": 343734303.0, + "step": 13581 + }, + { + "epoch": 1.491544036898748, + "grad_norm": 2.5238916873931885, + "learning_rate": 1e-06, + "loss": 0.8132, + "mean_token_accuracy": 0.7432900667190552, + "num_tokens": 343756833.0, + "step": 13582 + }, + { + "epoch": 1.4916538546013618, + "grad_norm": 2.2888920307159424, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7214633226394653, + "num_tokens": 343781188.0, + "step": 13583 + }, + { + "epoch": 
1.4917636723039753, + "grad_norm": 2.4157817363739014, + "learning_rate": 1e-06, + "loss": 0.8302, + "mean_token_accuracy": 0.7409824132919312, + "num_tokens": 343804081.0, + "step": 13584 + }, + { + "epoch": 1.491873490006589, + "grad_norm": 2.253882884979248, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.6929596662521362, + "num_tokens": 343830499.0, + "step": 13585 + }, + { + "epoch": 1.4919833077092028, + "grad_norm": 2.3188071250915527, + "learning_rate": 1e-06, + "loss": 0.8313, + "mean_token_accuracy": 0.7405022978782654, + "num_tokens": 343854871.0, + "step": 13586 + }, + { + "epoch": 1.4920931254118164, + "grad_norm": 2.4388444423675537, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7216691374778748, + "num_tokens": 343878360.0, + "step": 13587 + }, + { + "epoch": 1.49220294311443, + "grad_norm": 2.3882036209106445, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7230492830276489, + "num_tokens": 343902440.0, + "step": 13588 + }, + { + "epoch": 1.4923127608170437, + "grad_norm": 2.2438902854919434, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7093905210494995, + "num_tokens": 343929227.0, + "step": 13589 + }, + { + "epoch": 1.4924225785196574, + "grad_norm": 2.280550241470337, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7253513336181641, + "num_tokens": 343954606.0, + "step": 13590 + }, + { + "epoch": 1.492532396222271, + "grad_norm": 2.0547025203704834, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7243292331695557, + "num_tokens": 343983243.0, + "step": 13591 + }, + { + "epoch": 1.4926422139248847, + "grad_norm": 2.4020421504974365, + "learning_rate": 1e-06, + "loss": 0.8244, + "mean_token_accuracy": 0.743437647819519, + "num_tokens": 344005787.0, + "step": 13592 + }, + { + "epoch": 1.4927520316274983, + "grad_norm": 2.510342836380005, + "learning_rate": 1e-06, + "loss": 0.862, + "mean_token_accuracy": 
0.7325657606124878, + "num_tokens": 344027023.0, + "step": 13593 + }, + { + "epoch": 1.492861849330112, + "grad_norm": 2.1090362071990967, + "learning_rate": 1e-06, + "loss": 0.7749, + "mean_token_accuracy": 0.7537823915481567, + "num_tokens": 344052482.0, + "step": 13594 + }, + { + "epoch": 1.4929716670327258, + "grad_norm": 2.334777593612671, + "learning_rate": 1e-06, + "loss": 0.873, + "mean_token_accuracy": 0.7252440452575684, + "num_tokens": 344077917.0, + "step": 13595 + }, + { + "epoch": 1.4930814847353393, + "grad_norm": 2.338034152984619, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7149851322174072, + "num_tokens": 344103135.0, + "step": 13596 + }, + { + "epoch": 1.493191302437953, + "grad_norm": 2.401125431060791, + "learning_rate": 1e-06, + "loss": 0.8356, + "mean_token_accuracy": 0.7404450178146362, + "num_tokens": 344125172.0, + "step": 13597 + }, + { + "epoch": 1.4933011201405666, + "grad_norm": 2.31803822517395, + "learning_rate": 1e-06, + "loss": 0.8179, + "mean_token_accuracy": 0.7411530017852783, + "num_tokens": 344148874.0, + "step": 13598 + }, + { + "epoch": 1.4934109378431804, + "grad_norm": 2.220369338989258, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7064087986946106, + "num_tokens": 344177559.0, + "step": 13599 + }, + { + "epoch": 1.493520755545794, + "grad_norm": 2.1535205841064453, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.6927524209022522, + "num_tokens": 344205953.0, + "step": 13600 + }, + { + "epoch": 1.4936305732484076, + "grad_norm": 2.637666940689087, + "learning_rate": 1e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.7273800373077393, + "num_tokens": 344225962.0, + "step": 13601 + }, + { + "epoch": 1.4937403909510212, + "grad_norm": 2.102107524871826, + "learning_rate": 1e-06, + "loss": 0.8456, + "mean_token_accuracy": 0.7375072240829468, + "num_tokens": 344253407.0, + "step": 13602 + }, + { + "epoch": 1.493850208653635, + "grad_norm": 
2.1065900325775146, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.710177481174469, + "num_tokens": 344284164.0, + "step": 13603 + }, + { + "epoch": 1.4939600263562487, + "grad_norm": 2.448789119720459, + "learning_rate": 1e-06, + "loss": 0.8364, + "mean_token_accuracy": 0.7338591814041138, + "num_tokens": 344304280.0, + "step": 13604 + }, + { + "epoch": 1.4940698440588622, + "grad_norm": 2.1067919731140137, + "learning_rate": 1e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7199451923370361, + "num_tokens": 344332193.0, + "step": 13605 + }, + { + "epoch": 1.494179661761476, + "grad_norm": 2.224757432937622, + "learning_rate": 1e-06, + "loss": 0.788, + "mean_token_accuracy": 0.7573881149291992, + "num_tokens": 344356766.0, + "step": 13606 + }, + { + "epoch": 1.4942894794640895, + "grad_norm": 2.2644898891448975, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7309381365776062, + "num_tokens": 344381040.0, + "step": 13607 + }, + { + "epoch": 1.4943992971667033, + "grad_norm": 2.260281801223755, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7197622060775757, + "num_tokens": 344408409.0, + "step": 13608 + }, + { + "epoch": 1.494509114869317, + "grad_norm": 2.2612016201019287, + "learning_rate": 1e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.725458562374115, + "num_tokens": 344435882.0, + "step": 13609 + }, + { + "epoch": 1.4946189325719306, + "grad_norm": 2.2923221588134766, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7145028710365295, + "num_tokens": 344460812.0, + "step": 13610 + }, + { + "epoch": 1.4947287502745443, + "grad_norm": 2.4884519577026367, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.713116466999054, + "num_tokens": 344481308.0, + "step": 13611 + }, + { + "epoch": 1.4948385679771579, + "grad_norm": 2.0888404846191406, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.6920777559280396, + "num_tokens": 
344510834.0, + "step": 13612 + }, + { + "epoch": 1.4949483856797716, + "grad_norm": 2.1053009033203125, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7063181400299072, + "num_tokens": 344541014.0, + "step": 13613 + }, + { + "epoch": 1.4950582033823854, + "grad_norm": 2.2554991245269775, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7187625169754028, + "num_tokens": 344565904.0, + "step": 13614 + }, + { + "epoch": 1.495168021084999, + "grad_norm": 2.1711695194244385, + "learning_rate": 1e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.6975061893463135, + "num_tokens": 344594023.0, + "step": 13615 + }, + { + "epoch": 1.4952778387876124, + "grad_norm": 2.4811618328094482, + "learning_rate": 1e-06, + "loss": 0.8583, + "mean_token_accuracy": 0.7327876091003418, + "num_tokens": 344616458.0, + "step": 13616 + }, + { + "epoch": 1.4953876564902262, + "grad_norm": 2.2903518676757812, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7019496560096741, + "num_tokens": 344641823.0, + "step": 13617 + }, + { + "epoch": 1.49549747419284, + "grad_norm": 2.333575487136841, + "learning_rate": 1e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.7409729957580566, + "num_tokens": 344665985.0, + "step": 13618 + }, + { + "epoch": 1.4956072918954535, + "grad_norm": 2.103872537612915, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7040597200393677, + "num_tokens": 344694526.0, + "step": 13619 + }, + { + "epoch": 1.4957171095980673, + "grad_norm": 2.280160427093506, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7005042433738708, + "num_tokens": 344721335.0, + "step": 13620 + }, + { + "epoch": 1.4958269273006808, + "grad_norm": 2.4854423999786377, + "learning_rate": 1e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.718154788017273, + "num_tokens": 344745801.0, + "step": 13621 + }, + { + "epoch": 1.4959367450032945, + "grad_norm": 2.3327789306640625, + "learning_rate": 
1e-06, + "loss": 0.8453, + "mean_token_accuracy": 0.7344368696212769, + "num_tokens": 344770120.0, + "step": 13622 + }, + { + "epoch": 1.4960465627059083, + "grad_norm": 2.1187374591827393, + "learning_rate": 1e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.7253828644752502, + "num_tokens": 344797893.0, + "step": 13623 + }, + { + "epoch": 1.4961563804085218, + "grad_norm": 2.328657865524292, + "learning_rate": 1e-06, + "loss": 0.8107, + "mean_token_accuracy": 0.7432137727737427, + "num_tokens": 344821857.0, + "step": 13624 + }, + { + "epoch": 1.4962661981111356, + "grad_norm": 2.0400915145874023, + "learning_rate": 1e-06, + "loss": 1.0257, + "mean_token_accuracy": 0.685849666595459, + "num_tokens": 344854360.0, + "step": 13625 + }, + { + "epoch": 1.4963760158137491, + "grad_norm": 2.0657174587249756, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7134199738502502, + "num_tokens": 344883994.0, + "step": 13626 + }, + { + "epoch": 1.4964858335163629, + "grad_norm": 2.1023294925689697, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7119417786598206, + "num_tokens": 344914653.0, + "step": 13627 + }, + { + "epoch": 1.4965956512189764, + "grad_norm": 2.4681413173675537, + "learning_rate": 1e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7275854349136353, + "num_tokens": 344936851.0, + "step": 13628 + }, + { + "epoch": 1.4967054689215902, + "grad_norm": 2.5775609016418457, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7231738567352295, + "num_tokens": 344958106.0, + "step": 13629 + }, + { + "epoch": 1.4968152866242037, + "grad_norm": 2.8196933269500732, + "learning_rate": 1e-06, + "loss": 0.843, + "mean_token_accuracy": 0.7282869815826416, + "num_tokens": 344975972.0, + "step": 13630 + }, + { + "epoch": 1.4969251043268175, + "grad_norm": 2.1253762245178223, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.6851077675819397, + "num_tokens": 345003396.0, + "step": 13631 + }, + 
{ + "epoch": 1.4970349220294312, + "grad_norm": 2.344255208969116, + "learning_rate": 1e-06, + "loss": 0.7519, + "mean_token_accuracy": 0.7557333707809448, + "num_tokens": 345024666.0, + "step": 13632 + }, + { + "epoch": 1.4971447397320448, + "grad_norm": 2.552565574645996, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.719841480255127, + "num_tokens": 345046563.0, + "step": 13633 + }, + { + "epoch": 1.4972545574346585, + "grad_norm": 2.774308443069458, + "learning_rate": 1e-06, + "loss": 0.7841, + "mean_token_accuracy": 0.7442796230316162, + "num_tokens": 345064786.0, + "step": 13634 + }, + { + "epoch": 1.497364375137272, + "grad_norm": 2.1854519844055176, + "learning_rate": 1e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7266706824302673, + "num_tokens": 345092821.0, + "step": 13635 + }, + { + "epoch": 1.4974741928398858, + "grad_norm": 2.350839138031006, + "learning_rate": 1e-06, + "loss": 0.8533, + "mean_token_accuracy": 0.7354361414909363, + "num_tokens": 345115632.0, + "step": 13636 + }, + { + "epoch": 1.4975840105424996, + "grad_norm": 2.4269988536834717, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7258508205413818, + "num_tokens": 345138297.0, + "step": 13637 + }, + { + "epoch": 1.497693828245113, + "grad_norm": 2.098334312438965, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7184019684791565, + "num_tokens": 345168214.0, + "step": 13638 + }, + { + "epoch": 1.4978036459477266, + "grad_norm": 2.112055778503418, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.6984393000602722, + "num_tokens": 345197268.0, + "step": 13639 + }, + { + "epoch": 1.4979134636503404, + "grad_norm": 2.2157111167907715, + "learning_rate": 1e-06, + "loss": 0.8513, + "mean_token_accuracy": 0.738407552242279, + "num_tokens": 345224693.0, + "step": 13640 + }, + { + "epoch": 1.4980232813529542, + "grad_norm": 1.9637998342514038, + "learning_rate": 1e-06, + "loss": 0.9393, + 
"mean_token_accuracy": 0.7111068964004517, + "num_tokens": 345258491.0, + "step": 13641 + }, + { + "epoch": 1.4981330990555677, + "grad_norm": 2.7171390056610107, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7147290706634521, + "num_tokens": 345277782.0, + "step": 13642 + }, + { + "epoch": 1.4982429167581814, + "grad_norm": 2.6828255653381348, + "learning_rate": 1e-06, + "loss": 0.831, + "mean_token_accuracy": 0.7379277348518372, + "num_tokens": 345297081.0, + "step": 13643 + }, + { + "epoch": 1.498352734460795, + "grad_norm": 2.314549446105957, + "learning_rate": 1e-06, + "loss": 0.881, + "mean_token_accuracy": 0.7186172008514404, + "num_tokens": 345319581.0, + "step": 13644 + }, + { + "epoch": 1.4984625521634087, + "grad_norm": 2.1261425018310547, + "learning_rate": 1e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.7306444048881531, + "num_tokens": 345346567.0, + "step": 13645 + }, + { + "epoch": 1.4985723698660225, + "grad_norm": 2.5213053226470947, + "learning_rate": 1e-06, + "loss": 0.824, + "mean_token_accuracy": 0.7374302744865417, + "num_tokens": 345365908.0, + "step": 13646 + }, + { + "epoch": 1.498682187568636, + "grad_norm": 2.196382522583008, + "learning_rate": 1e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.7345930933952332, + "num_tokens": 345390597.0, + "step": 13647 + }, + { + "epoch": 1.4987920052712498, + "grad_norm": 2.2553770542144775, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7148809432983398, + "num_tokens": 345417828.0, + "step": 13648 + }, + { + "epoch": 1.4989018229738633, + "grad_norm": 2.763367176055908, + "learning_rate": 1e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7171241044998169, + "num_tokens": 345435651.0, + "step": 13649 + }, + { + "epoch": 1.499011640676477, + "grad_norm": 2.5078632831573486, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7243342399597168, + "num_tokens": 345456110.0, + "step": 13650 + }, + { + "epoch": 1.4991214583790908, + 
"grad_norm": 2.135408401489258, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7234749794006348, + "num_tokens": 345483129.0, + "step": 13651 + }, + { + "epoch": 1.4992312760817044, + "grad_norm": 2.3892269134521484, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7128754258155823, + "num_tokens": 345509021.0, + "step": 13652 + }, + { + "epoch": 1.499341093784318, + "grad_norm": 2.5613036155700684, + "learning_rate": 1e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7196683883666992, + "num_tokens": 345532432.0, + "step": 13653 + }, + { + "epoch": 1.4994509114869317, + "grad_norm": 2.041198968887329, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7126452922821045, + "num_tokens": 345560230.0, + "step": 13654 + }, + { + "epoch": 1.4995607291895454, + "grad_norm": 2.4485812187194824, + "learning_rate": 1e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.7287323474884033, + "num_tokens": 345581952.0, + "step": 13655 + }, + { + "epoch": 1.499670546892159, + "grad_norm": 2.3576149940490723, + "learning_rate": 1e-06, + "loss": 0.881, + "mean_token_accuracy": 0.7290439605712891, + "num_tokens": 345605681.0, + "step": 13656 + }, + { + "epoch": 1.4997803645947727, + "grad_norm": 2.227424144744873, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7178409099578857, + "num_tokens": 345631047.0, + "step": 13657 + }, + { + "epoch": 1.4998901822973862, + "grad_norm": 2.282428741455078, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.722820520401001, + "num_tokens": 345655541.0, + "step": 13658 + }, + { + "epoch": 1.5, + "grad_norm": 2.3251943588256836, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7176491618156433, + "num_tokens": 345680604.0, + "step": 13659 + }, + { + "epoch": 1.5001098177026138, + "grad_norm": 2.397883892059326, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7067511081695557, + "num_tokens": 
345704179.0, + "step": 13660 + }, + { + "epoch": 1.5002196354052273, + "grad_norm": 2.5882742404937744, + "learning_rate": 1e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7328507900238037, + "num_tokens": 345724652.0, + "step": 13661 + }, + { + "epoch": 1.5003294531078408, + "grad_norm": 1.789685845375061, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7265164852142334, + "num_tokens": 345760446.0, + "step": 13662 + }, + { + "epoch": 1.5004392708104546, + "grad_norm": 2.235647678375244, + "learning_rate": 1e-06, + "loss": 0.8828, + "mean_token_accuracy": 0.7251597046852112, + "num_tokens": 345784945.0, + "step": 13663 + }, + { + "epoch": 1.5005490885130683, + "grad_norm": 2.090174436569214, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7129610776901245, + "num_tokens": 345813054.0, + "step": 13664 + }, + { + "epoch": 1.500658906215682, + "grad_norm": 2.2671542167663574, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7130299806594849, + "num_tokens": 345839742.0, + "step": 13665 + }, + { + "epoch": 1.5007687239182956, + "grad_norm": 2.093935489654541, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.712653398513794, + "num_tokens": 345869699.0, + "step": 13666 + }, + { + "epoch": 1.5008785416209092, + "grad_norm": 2.2834959030151367, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.6967060565948486, + "num_tokens": 345897084.0, + "step": 13667 + }, + { + "epoch": 1.500988359323523, + "grad_norm": 1.9744198322296143, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7117725610733032, + "num_tokens": 345929838.0, + "step": 13668 + }, + { + "epoch": 1.5010981770261367, + "grad_norm": 2.5116255283355713, + "learning_rate": 1e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.72614985704422, + "num_tokens": 345950170.0, + "step": 13669 + }, + { + "epoch": 1.5012079947287504, + "grad_norm": 2.288461208343506, + "learning_rate": 1e-06, + 
"loss": 0.7894, + "mean_token_accuracy": 0.7480881214141846, + "num_tokens": 345974694.0, + "step": 13670 + }, + { + "epoch": 1.501317812431364, + "grad_norm": 2.502892255783081, + "learning_rate": 1e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7317295074462891, + "num_tokens": 345995326.0, + "step": 13671 + }, + { + "epoch": 1.5014276301339775, + "grad_norm": 2.319497585296631, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7272747755050659, + "num_tokens": 346021641.0, + "step": 13672 + }, + { + "epoch": 1.5015374478365913, + "grad_norm": 2.463181495666504, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7061572670936584, + "num_tokens": 346045331.0, + "step": 13673 + }, + { + "epoch": 1.501647265539205, + "grad_norm": 2.1263809204101562, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7205994129180908, + "num_tokens": 346073224.0, + "step": 13674 + }, + { + "epoch": 1.5017570832418186, + "grad_norm": 2.785762310028076, + "learning_rate": 1e-06, + "loss": 0.8098, + "mean_token_accuracy": 0.7435193061828613, + "num_tokens": 346090594.0, + "step": 13675 + }, + { + "epoch": 1.501866900944432, + "grad_norm": 2.18467116355896, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.72307288646698, + "num_tokens": 346116054.0, + "step": 13676 + }, + { + "epoch": 1.5019767186470458, + "grad_norm": 2.435870409011841, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7205681800842285, + "num_tokens": 346138071.0, + "step": 13677 + }, + { + "epoch": 1.5020865363496596, + "grad_norm": 2.6179165840148926, + "learning_rate": 1e-06, + "loss": 0.8092, + "mean_token_accuracy": 0.7370703220367432, + "num_tokens": 346157172.0, + "step": 13678 + }, + { + "epoch": 1.5021963540522734, + "grad_norm": 2.398038148880005, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7191616296768188, + "num_tokens": 346181688.0, + "step": 13679 + }, + { + "epoch": 
1.502306171754887, + "grad_norm": 2.153681516647339, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.6949563026428223, + "num_tokens": 346211292.0, + "step": 13680 + }, + { + "epoch": 1.5024159894575004, + "grad_norm": 2.8630013465881348, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7230105400085449, + "num_tokens": 346228648.0, + "step": 13681 + }, + { + "epoch": 1.5025258071601142, + "grad_norm": 2.017202854156494, + "learning_rate": 1e-06, + "loss": 0.8338, + "mean_token_accuracy": 0.7359856367111206, + "num_tokens": 346259087.0, + "step": 13682 + }, + { + "epoch": 1.502635624862728, + "grad_norm": 2.8228065967559814, + "learning_rate": 1e-06, + "loss": 0.7585, + "mean_token_accuracy": 0.7611521482467651, + "num_tokens": 346275828.0, + "step": 13683 + }, + { + "epoch": 1.5027454425653415, + "grad_norm": 2.235241651535034, + "learning_rate": 1e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7201913595199585, + "num_tokens": 346300259.0, + "step": 13684 + }, + { + "epoch": 1.5028552602679552, + "grad_norm": 2.691880941390991, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7130131721496582, + "num_tokens": 346320866.0, + "step": 13685 + }, + { + "epoch": 1.5029650779705688, + "grad_norm": 2.1188831329345703, + "learning_rate": 1e-06, + "loss": 0.8259, + "mean_token_accuracy": 0.7433212399482727, + "num_tokens": 346348234.0, + "step": 13686 + }, + { + "epoch": 1.5030748956731825, + "grad_norm": 2.363588333129883, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7182400822639465, + "num_tokens": 346372011.0, + "step": 13687 + }, + { + "epoch": 1.5031847133757963, + "grad_norm": 2.6945457458496094, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7191126346588135, + "num_tokens": 346392470.0, + "step": 13688 + }, + { + "epoch": 1.5032945310784098, + "grad_norm": 2.628933906555176, + "learning_rate": 1e-06, + "loss": 0.8772, + "mean_token_accuracy": 
0.7352854013442993, + "num_tokens": 346412969.0, + "step": 13689 + }, + { + "epoch": 1.5034043487810234, + "grad_norm": 2.4397575855255127, + "learning_rate": 1e-06, + "loss": 0.8566, + "mean_token_accuracy": 0.732061505317688, + "num_tokens": 346434050.0, + "step": 13690 + }, + { + "epoch": 1.5035141664836371, + "grad_norm": 2.2764413356781006, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.728589653968811, + "num_tokens": 346457736.0, + "step": 13691 + }, + { + "epoch": 1.5036239841862509, + "grad_norm": 2.006312370300293, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7198197841644287, + "num_tokens": 346488623.0, + "step": 13692 + }, + { + "epoch": 1.5037338018888646, + "grad_norm": 2.0658679008483887, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7084197402000427, + "num_tokens": 346519616.0, + "step": 13693 + }, + { + "epoch": 1.5038436195914782, + "grad_norm": 2.312145471572876, + "learning_rate": 1e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7166826725006104, + "num_tokens": 346543768.0, + "step": 13694 + }, + { + "epoch": 1.5039534372940917, + "grad_norm": 2.3012642860412598, + "learning_rate": 1e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7295321226119995, + "num_tokens": 346568186.0, + "step": 13695 + }, + { + "epoch": 1.5040632549967055, + "grad_norm": 2.0914506912231445, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7096693515777588, + "num_tokens": 346596991.0, + "step": 13696 + }, + { + "epoch": 1.5041730726993192, + "grad_norm": 2.2554819583892822, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7178232669830322, + "num_tokens": 346623815.0, + "step": 13697 + }, + { + "epoch": 1.5042828904019327, + "grad_norm": 2.120520830154419, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7091799974441528, + "num_tokens": 346653902.0, + "step": 13698 + }, + { + "epoch": 1.5043927081045465, + "grad_norm": 
2.1956872940063477, + "learning_rate": 1e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7294324636459351, + "num_tokens": 346680683.0, + "step": 13699 + }, + { + "epoch": 1.50450252580716, + "grad_norm": 2.184410810470581, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7113988995552063, + "num_tokens": 346707296.0, + "step": 13700 + }, + { + "epoch": 1.5046123435097738, + "grad_norm": 1.9932175874710083, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.6984302997589111, + "num_tokens": 346738324.0, + "step": 13701 + }, + { + "epoch": 1.5047221612123876, + "grad_norm": 2.365114450454712, + "learning_rate": 1e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7211438417434692, + "num_tokens": 346762764.0, + "step": 13702 + }, + { + "epoch": 1.504831978915001, + "grad_norm": 2.1420090198516846, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7237306833267212, + "num_tokens": 346789667.0, + "step": 13703 + }, + { + "epoch": 1.5049417966176146, + "grad_norm": 2.440906047821045, + "learning_rate": 1e-06, + "loss": 0.8053, + "mean_token_accuracy": 0.7461875081062317, + "num_tokens": 346809954.0, + "step": 13704 + }, + { + "epoch": 1.5050516143202284, + "grad_norm": 2.582181453704834, + "learning_rate": 1e-06, + "loss": 0.8703, + "mean_token_accuracy": 0.728091835975647, + "num_tokens": 346829470.0, + "step": 13705 + }, + { + "epoch": 1.5051614320228421, + "grad_norm": 2.3597757816314697, + "learning_rate": 1e-06, + "loss": 0.8227, + "mean_token_accuracy": 0.7398332357406616, + "num_tokens": 346851455.0, + "step": 13706 + }, + { + "epoch": 1.505271249725456, + "grad_norm": 2.4600465297698975, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7216320633888245, + "num_tokens": 346873818.0, + "step": 13707 + }, + { + "epoch": 1.5053810674280694, + "grad_norm": 2.44693660736084, + "learning_rate": 1e-06, + "loss": 0.8224, + "mean_token_accuracy": 0.7442941069602966, + "num_tokens": 
346895415.0, + "step": 13708 + }, + { + "epoch": 1.505490885130683, + "grad_norm": 2.4380788803100586, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7202615737915039, + "num_tokens": 346918281.0, + "step": 13709 + }, + { + "epoch": 1.5056007028332967, + "grad_norm": 2.594148635864258, + "learning_rate": 1e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7226276993751526, + "num_tokens": 346938731.0, + "step": 13710 + }, + { + "epoch": 1.5057105205359105, + "grad_norm": 2.4373056888580322, + "learning_rate": 1e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7339239716529846, + "num_tokens": 346959361.0, + "step": 13711 + }, + { + "epoch": 1.505820338238524, + "grad_norm": 2.0818228721618652, + "learning_rate": 1e-06, + "loss": 0.8333, + "mean_token_accuracy": 0.7363027334213257, + "num_tokens": 346986688.0, + "step": 13712 + }, + { + "epoch": 1.5059301559411375, + "grad_norm": 2.370300054550171, + "learning_rate": 1e-06, + "loss": 0.8418, + "mean_token_accuracy": 0.7299745082855225, + "num_tokens": 347010650.0, + "step": 13713 + }, + { + "epoch": 1.5060399736437513, + "grad_norm": 2.254028558731079, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7070039510726929, + "num_tokens": 347037047.0, + "step": 13714 + }, + { + "epoch": 1.506149791346365, + "grad_norm": 2.1818368434906006, + "learning_rate": 1e-06, + "loss": 0.8691, + "mean_token_accuracy": 0.726475715637207, + "num_tokens": 347062092.0, + "step": 13715 + }, + { + "epoch": 1.5062596090489788, + "grad_norm": 2.42851185798645, + "learning_rate": 1e-06, + "loss": 0.8355, + "mean_token_accuracy": 0.7397321462631226, + "num_tokens": 347085101.0, + "step": 13716 + }, + { + "epoch": 1.5063694267515924, + "grad_norm": 2.5829033851623535, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7193894386291504, + "num_tokens": 347106523.0, + "step": 13717 + }, + { + "epoch": 1.506479244454206, + "grad_norm": 1.9881646633148193, + "learning_rate": 1e-06, 
+ "loss": 0.8418, + "mean_token_accuracy": 0.738319456577301, + "num_tokens": 347135436.0, + "step": 13718 + }, + { + "epoch": 1.5065890621568196, + "grad_norm": 2.142585039138794, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7134541273117065, + "num_tokens": 347161638.0, + "step": 13719 + }, + { + "epoch": 1.5066988798594334, + "grad_norm": 2.232248544692993, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7144997119903564, + "num_tokens": 347186783.0, + "step": 13720 + }, + { + "epoch": 1.5068086975620472, + "grad_norm": 2.2416036128997803, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7198095321655273, + "num_tokens": 347212285.0, + "step": 13721 + }, + { + "epoch": 1.5069185152646607, + "grad_norm": 1.9331084489822388, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7078347206115723, + "num_tokens": 347244253.0, + "step": 13722 + }, + { + "epoch": 1.5070283329672742, + "grad_norm": 2.7604477405548096, + "learning_rate": 1e-06, + "loss": 0.8458, + "mean_token_accuracy": 0.7305981516838074, + "num_tokens": 347263471.0, + "step": 13723 + }, + { + "epoch": 1.507138150669888, + "grad_norm": 2.2688536643981934, + "learning_rate": 1e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7141414880752563, + "num_tokens": 347289833.0, + "step": 13724 + }, + { + "epoch": 1.5072479683725017, + "grad_norm": 2.0922088623046875, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7150991559028625, + "num_tokens": 347317189.0, + "step": 13725 + }, + { + "epoch": 1.5073577860751153, + "grad_norm": 2.210965156555176, + "learning_rate": 1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.724357008934021, + "num_tokens": 347344674.0, + "step": 13726 + }, + { + "epoch": 1.5074676037777288, + "grad_norm": 2.3637890815734863, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.712114691734314, + "num_tokens": 347370678.0, + "step": 13727 + }, + { + 
"epoch": 1.5075774214803426, + "grad_norm": 2.1015615463256836, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7142465114593506, + "num_tokens": 347399729.0, + "step": 13728 + }, + { + "epoch": 1.5076872391829563, + "grad_norm": 2.210766553878784, + "learning_rate": 1e-06, + "loss": 0.869, + "mean_token_accuracy": 0.7253129482269287, + "num_tokens": 347423742.0, + "step": 13729 + }, + { + "epoch": 1.50779705688557, + "grad_norm": 2.275888442993164, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.719901978969574, + "num_tokens": 347451460.0, + "step": 13730 + }, + { + "epoch": 1.5079068745881836, + "grad_norm": 2.0714573860168457, + "learning_rate": 1e-06, + "loss": 0.8059, + "mean_token_accuracy": 0.7472915649414062, + "num_tokens": 347481398.0, + "step": 13731 + }, + { + "epoch": 1.5080166922907972, + "grad_norm": 2.054130792617798, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7125711441040039, + "num_tokens": 347511149.0, + "step": 13732 + }, + { + "epoch": 1.508126509993411, + "grad_norm": 2.606177806854248, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.729073703289032, + "num_tokens": 347531841.0, + "step": 13733 + }, + { + "epoch": 1.5082363276960247, + "grad_norm": 2.129303455352783, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7096692323684692, + "num_tokens": 347559493.0, + "step": 13734 + }, + { + "epoch": 1.5083461453986382, + "grad_norm": 2.27738881111145, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.71614670753479, + "num_tokens": 347583700.0, + "step": 13735 + }, + { + "epoch": 1.508455963101252, + "grad_norm": 2.2576634883880615, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7106825113296509, + "num_tokens": 347611314.0, + "step": 13736 + }, + { + "epoch": 1.5085657808038655, + "grad_norm": 2.2964742183685303, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 
0.6966099739074707, + "num_tokens": 347638534.0, + "step": 13737 + }, + { + "epoch": 1.5086755985064793, + "grad_norm": 2.2071924209594727, + "learning_rate": 1e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.7309410572052002, + "num_tokens": 347663916.0, + "step": 13738 + }, + { + "epoch": 1.508785416209093, + "grad_norm": 2.1337854862213135, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7159533500671387, + "num_tokens": 347693836.0, + "step": 13739 + }, + { + "epoch": 1.5088952339117065, + "grad_norm": 1.8724071979522705, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.6956901550292969, + "num_tokens": 347731818.0, + "step": 13740 + }, + { + "epoch": 1.50900505161432, + "grad_norm": 2.4780187606811523, + "learning_rate": 1e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7271959781646729, + "num_tokens": 347754051.0, + "step": 13741 + }, + { + "epoch": 1.5091148693169338, + "grad_norm": 2.355076789855957, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7104952931404114, + "num_tokens": 347778667.0, + "step": 13742 + }, + { + "epoch": 1.5092246870195476, + "grad_norm": 2.2232143878936768, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7121878862380981, + "num_tokens": 347805510.0, + "step": 13743 + }, + { + "epoch": 1.5093345047221614, + "grad_norm": 2.320544719696045, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7008792161941528, + "num_tokens": 347830870.0, + "step": 13744 + }, + { + "epoch": 1.5094443224247749, + "grad_norm": 2.3001749515533447, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7187505960464478, + "num_tokens": 347855536.0, + "step": 13745 + }, + { + "epoch": 1.5095541401273884, + "grad_norm": 2.244239330291748, + "learning_rate": 1e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7234318256378174, + "num_tokens": 347880728.0, + "step": 13746 + }, + { + "epoch": 1.5096639578300022, + "grad_norm": 
2.107212543487549, + "learning_rate": 1e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7296937108039856, + "num_tokens": 347907275.0, + "step": 13747 + }, + { + "epoch": 1.509773775532616, + "grad_norm": 2.1116082668304443, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7210675477981567, + "num_tokens": 347937181.0, + "step": 13748 + }, + { + "epoch": 1.5098835932352295, + "grad_norm": 2.2228384017944336, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.719354510307312, + "num_tokens": 347963197.0, + "step": 13749 + }, + { + "epoch": 1.5099934109378432, + "grad_norm": 2.346597671508789, + "learning_rate": 1e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7431020736694336, + "num_tokens": 347987098.0, + "step": 13750 + }, + { + "epoch": 1.5101032286404568, + "grad_norm": 2.203291654586792, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7172529697418213, + "num_tokens": 348014897.0, + "step": 13751 + }, + { + "epoch": 1.5102130463430705, + "grad_norm": 2.1176464557647705, + "learning_rate": 1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7237896919250488, + "num_tokens": 348043970.0, + "step": 13752 + }, + { + "epoch": 1.5103228640456843, + "grad_norm": 2.319535493850708, + "learning_rate": 1e-06, + "loss": 0.8075, + "mean_token_accuracy": 0.746323823928833, + "num_tokens": 348067299.0, + "step": 13753 + }, + { + "epoch": 1.5104326817482978, + "grad_norm": 2.0883729457855225, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7045904397964478, + "num_tokens": 348097971.0, + "step": 13754 + }, + { + "epoch": 1.5105424994509113, + "grad_norm": 2.068375825881958, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7203260064125061, + "num_tokens": 348125386.0, + "step": 13755 + }, + { + "epoch": 1.510652317153525, + "grad_norm": 2.555035352706909, + "learning_rate": 1e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.7368125915527344, + "num_tokens": 
348146782.0, + "step": 13756 + }, + { + "epoch": 1.5107621348561389, + "grad_norm": 2.3787529468536377, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7180598974227905, + "num_tokens": 348168944.0, + "step": 13757 + }, + { + "epoch": 1.5108719525587526, + "grad_norm": 2.0502090454101562, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7041773796081543, + "num_tokens": 348196695.0, + "step": 13758 + }, + { + "epoch": 1.5109817702613662, + "grad_norm": 2.586402416229248, + "learning_rate": 1e-06, + "loss": 0.7376, + "mean_token_accuracy": 0.7608561515808105, + "num_tokens": 348216963.0, + "step": 13759 + }, + { + "epoch": 1.5110915879639797, + "grad_norm": 2.2335169315338135, + "learning_rate": 1e-06, + "loss": 1.0659, + "mean_token_accuracy": 0.6782379150390625, + "num_tokens": 348244528.0, + "step": 13760 + }, + { + "epoch": 1.5112014056665934, + "grad_norm": 2.4314804077148438, + "learning_rate": 1e-06, + "loss": 0.7761, + "mean_token_accuracy": 0.7455737590789795, + "num_tokens": 348265526.0, + "step": 13761 + }, + { + "epoch": 1.5113112233692072, + "grad_norm": 2.4763755798339844, + "learning_rate": 1e-06, + "loss": 0.8379, + "mean_token_accuracy": 0.7353023290634155, + "num_tokens": 348289008.0, + "step": 13762 + }, + { + "epoch": 1.5114210410718207, + "grad_norm": 2.1467108726501465, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.6912757158279419, + "num_tokens": 348318161.0, + "step": 13763 + }, + { + "epoch": 1.5115308587744345, + "grad_norm": 2.2775704860687256, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7211503982543945, + "num_tokens": 348344022.0, + "step": 13764 + }, + { + "epoch": 1.511640676477048, + "grad_norm": 2.1871511936187744, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7221710681915283, + "num_tokens": 348371258.0, + "step": 13765 + }, + { + "epoch": 1.5117504941796618, + "grad_norm": 2.2103779315948486, + 
"learning_rate": 1e-06, + "loss": 0.8322, + "mean_token_accuracy": 0.7380993366241455, + "num_tokens": 348396177.0, + "step": 13766 + }, + { + "epoch": 1.5118603118822755, + "grad_norm": 2.3205716609954834, + "learning_rate": 1e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.7278823256492615, + "num_tokens": 348420476.0, + "step": 13767 + }, + { + "epoch": 1.511970129584889, + "grad_norm": 1.9771130084991455, + "learning_rate": 1e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.7254602909088135, + "num_tokens": 348450713.0, + "step": 13768 + }, + { + "epoch": 1.5120799472875026, + "grad_norm": 2.58042573928833, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7219123840332031, + "num_tokens": 348472077.0, + "step": 13769 + }, + { + "epoch": 1.5121897649901164, + "grad_norm": 2.3837087154388428, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7215942144393921, + "num_tokens": 348495855.0, + "step": 13770 + }, + { + "epoch": 1.5122995826927301, + "grad_norm": 1.9530715942382812, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7159085869789124, + "num_tokens": 348529056.0, + "step": 13771 + }, + { + "epoch": 1.5124094003953439, + "grad_norm": 2.391117811203003, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7209553718566895, + "num_tokens": 348551330.0, + "step": 13772 + }, + { + "epoch": 1.5125192180979574, + "grad_norm": 2.114562511444092, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7035836577415466, + "num_tokens": 348579081.0, + "step": 13773 + }, + { + "epoch": 1.512629035800571, + "grad_norm": 2.197063684463501, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7129082679748535, + "num_tokens": 348607650.0, + "step": 13774 + }, + { + "epoch": 1.5127388535031847, + "grad_norm": 2.5560994148254395, + "learning_rate": 1e-06, + "loss": 0.8388, + "mean_token_accuracy": 0.730771541595459, + "num_tokens": 348627636.0, + "step": 
13775 + }, + { + "epoch": 1.5128486712057985, + "grad_norm": 1.8929986953735352, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7199908494949341, + "num_tokens": 348659480.0, + "step": 13776 + }, + { + "epoch": 1.512958488908412, + "grad_norm": 2.1218860149383545, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7266485095024109, + "num_tokens": 348687057.0, + "step": 13777 + }, + { + "epoch": 1.5130683066110255, + "grad_norm": 2.5053110122680664, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7143294811248779, + "num_tokens": 348709799.0, + "step": 13778 + }, + { + "epoch": 1.5131781243136393, + "grad_norm": 2.5127265453338623, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7104458808898926, + "num_tokens": 348732018.0, + "step": 13779 + }, + { + "epoch": 1.513287942016253, + "grad_norm": 2.447525978088379, + "learning_rate": 1e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7336903810501099, + "num_tokens": 348754559.0, + "step": 13780 + }, + { + "epoch": 1.5133977597188668, + "grad_norm": 2.496958017349243, + "learning_rate": 1e-06, + "loss": 0.7813, + "mean_token_accuracy": 0.7502855062484741, + "num_tokens": 348776378.0, + "step": 13781 + }, + { + "epoch": 1.5135075774214803, + "grad_norm": 2.3483834266662598, + "learning_rate": 1e-06, + "loss": 0.8505, + "mean_token_accuracy": 0.7233089804649353, + "num_tokens": 348800198.0, + "step": 13782 + }, + { + "epoch": 1.5136173951240939, + "grad_norm": 2.076526641845703, + "learning_rate": 1e-06, + "loss": 0.8479, + "mean_token_accuracy": 0.7307034730911255, + "num_tokens": 348829455.0, + "step": 13783 + }, + { + "epoch": 1.5137272128267076, + "grad_norm": 2.5565850734710693, + "learning_rate": 1e-06, + "loss": 0.8757, + "mean_token_accuracy": 0.7291753888130188, + "num_tokens": 348850060.0, + "step": 13784 + }, + { + "epoch": 1.5138370305293214, + "grad_norm": 1.9487699270248413, + "learning_rate": 1e-06, + "loss": 
0.9642, + "mean_token_accuracy": 0.6972401738166809, + "num_tokens": 348880829.0, + "step": 13785 + }, + { + "epoch": 1.5139468482319351, + "grad_norm": 2.3015120029449463, + "learning_rate": 1e-06, + "loss": 0.8558, + "mean_token_accuracy": 0.741396427154541, + "num_tokens": 348905966.0, + "step": 13786 + }, + { + "epoch": 1.5140566659345487, + "grad_norm": 2.484205484390259, + "learning_rate": 1e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.7315685749053955, + "num_tokens": 348926840.0, + "step": 13787 + }, + { + "epoch": 1.5141664836371622, + "grad_norm": 2.389880895614624, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7078973054885864, + "num_tokens": 348951449.0, + "step": 13788 + }, + { + "epoch": 1.514276301339776, + "grad_norm": 2.2464799880981445, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.723851203918457, + "num_tokens": 348978037.0, + "step": 13789 + }, + { + "epoch": 1.5143861190423897, + "grad_norm": 2.0043320655822754, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7281156778335571, + "num_tokens": 349007847.0, + "step": 13790 + }, + { + "epoch": 1.5144959367450033, + "grad_norm": 2.297884941101074, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7031861543655396, + "num_tokens": 349031979.0, + "step": 13791 + }, + { + "epoch": 1.5146057544476168, + "grad_norm": 2.1112868785858154, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.7021386027336121, + "num_tokens": 349059827.0, + "step": 13792 + }, + { + "epoch": 1.5147155721502306, + "grad_norm": 2.2929484844207764, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7118244171142578, + "num_tokens": 349085370.0, + "step": 13793 + }, + { + "epoch": 1.5148253898528443, + "grad_norm": 2.0201518535614014, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7055164575576782, + "num_tokens": 349119491.0, + "step": 13794 + }, + { + "epoch": 
1.514935207555458, + "grad_norm": 2.329796314239502, + "learning_rate": 1e-06, + "loss": 0.8376, + "mean_token_accuracy": 0.7270418405532837, + "num_tokens": 349142863.0, + "step": 13795 + }, + { + "epoch": 1.5150450252580716, + "grad_norm": 2.3687825202941895, + "learning_rate": 1e-06, + "loss": 0.8024, + "mean_token_accuracy": 0.741466224193573, + "num_tokens": 349166652.0, + "step": 13796 + }, + { + "epoch": 1.5151548429606851, + "grad_norm": 2.286635160446167, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7046371698379517, + "num_tokens": 349195742.0, + "step": 13797 + }, + { + "epoch": 1.515264660663299, + "grad_norm": 2.192323923110962, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7202990055084229, + "num_tokens": 349220197.0, + "step": 13798 + }, + { + "epoch": 1.5153744783659127, + "grad_norm": 2.601696014404297, + "learning_rate": 1e-06, + "loss": 0.8052, + "mean_token_accuracy": 0.7454385757446289, + "num_tokens": 349239301.0, + "step": 13799 + }, + { + "epoch": 1.5154842960685262, + "grad_norm": 2.1965949535369873, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7229585647583008, + "num_tokens": 349264363.0, + "step": 13800 + }, + { + "epoch": 1.51559411377114, + "grad_norm": 2.235579490661621, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.717581033706665, + "num_tokens": 349289087.0, + "step": 13801 + }, + { + "epoch": 1.5157039314737535, + "grad_norm": 2.2588846683502197, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.713587760925293, + "num_tokens": 349316222.0, + "step": 13802 + }, + { + "epoch": 1.5158137491763672, + "grad_norm": 2.20371675491333, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7210959196090698, + "num_tokens": 349342303.0, + "step": 13803 + }, + { + "epoch": 1.515923566878981, + "grad_norm": 2.0775551795959473, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 
0.7107940316200256, + "num_tokens": 349371198.0, + "step": 13804 + }, + { + "epoch": 1.5160333845815945, + "grad_norm": 2.073718786239624, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.711553692817688, + "num_tokens": 349400358.0, + "step": 13805 + }, + { + "epoch": 1.516143202284208, + "grad_norm": 2.5211710929870605, + "learning_rate": 1e-06, + "loss": 0.8829, + "mean_token_accuracy": 0.7351889610290527, + "num_tokens": 349421353.0, + "step": 13806 + }, + { + "epoch": 1.5162530199868218, + "grad_norm": 2.2525136470794678, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7188068628311157, + "num_tokens": 349447273.0, + "step": 13807 + }, + { + "epoch": 1.5163628376894356, + "grad_norm": 2.341235399246216, + "learning_rate": 1e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7244957685470581, + "num_tokens": 349470218.0, + "step": 13808 + }, + { + "epoch": 1.5164726553920493, + "grad_norm": 2.3800623416900635, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7199263572692871, + "num_tokens": 349494211.0, + "step": 13809 + }, + { + "epoch": 1.5165824730946629, + "grad_norm": 2.941011428833008, + "learning_rate": 1e-06, + "loss": 0.8018, + "mean_token_accuracy": 0.7454656958580017, + "num_tokens": 349511743.0, + "step": 13810 + }, + { + "epoch": 1.5166922907972764, + "grad_norm": 2.371933698654175, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7213908433914185, + "num_tokens": 349535115.0, + "step": 13811 + }, + { + "epoch": 1.5168021084998902, + "grad_norm": 2.3232474327087402, + "learning_rate": 1e-06, + "loss": 0.8039, + "mean_token_accuracy": 0.7406543493270874, + "num_tokens": 349558836.0, + "step": 13812 + }, + { + "epoch": 1.516911926202504, + "grad_norm": 2.099752426147461, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7103849053382874, + "num_tokens": 349586878.0, + "step": 13813 + }, + { + "epoch": 1.5170217439051175, + "grad_norm": 
2.1769790649414062, + "learning_rate": 1e-06, + "loss": 0.8674, + "mean_token_accuracy": 0.7331528067588806, + "num_tokens": 349614198.0, + "step": 13814 + }, + { + "epoch": 1.5171315616077312, + "grad_norm": 2.248279333114624, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.708251953125, + "num_tokens": 349639377.0, + "step": 13815 + }, + { + "epoch": 1.5172413793103448, + "grad_norm": 2.319493532180786, + "learning_rate": 1e-06, + "loss": 0.8516, + "mean_token_accuracy": 0.7268218994140625, + "num_tokens": 349663631.0, + "step": 13816 + }, + { + "epoch": 1.5173511970129585, + "grad_norm": 2.105844259262085, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7134698629379272, + "num_tokens": 349695195.0, + "step": 13817 + }, + { + "epoch": 1.5174610147155723, + "grad_norm": 2.244694471359253, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7125221490859985, + "num_tokens": 349722184.0, + "step": 13818 + }, + { + "epoch": 1.5175708324181858, + "grad_norm": 2.2810752391815186, + "learning_rate": 1e-06, + "loss": 0.881, + "mean_token_accuracy": 0.720971405506134, + "num_tokens": 349746389.0, + "step": 13819 + }, + { + "epoch": 1.5176806501207993, + "grad_norm": 2.3019402027130127, + "learning_rate": 1e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7260857820510864, + "num_tokens": 349769964.0, + "step": 13820 + }, + { + "epoch": 1.517790467823413, + "grad_norm": 2.2043628692626953, + "learning_rate": 1e-06, + "loss": 0.8331, + "mean_token_accuracy": 0.7386458516120911, + "num_tokens": 349796761.0, + "step": 13821 + }, + { + "epoch": 1.5179002855260268, + "grad_norm": 2.1263039112091064, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7300589084625244, + "num_tokens": 349824092.0, + "step": 13822 + }, + { + "epoch": 1.5180101032286406, + "grad_norm": 2.0978057384490967, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7266183495521545, + "num_tokens": 
349853225.0, + "step": 13823 + }, + { + "epoch": 1.5181199209312541, + "grad_norm": 2.7728686332702637, + "learning_rate": 1e-06, + "loss": 0.788, + "mean_token_accuracy": 0.749951183795929, + "num_tokens": 349871813.0, + "step": 13824 + }, + { + "epoch": 1.5182297386338677, + "grad_norm": 2.719083309173584, + "learning_rate": 1e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7165113091468811, + "num_tokens": 349891276.0, + "step": 13825 + }, + { + "epoch": 1.5183395563364814, + "grad_norm": 2.2380270957946777, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7221935987472534, + "num_tokens": 349917834.0, + "step": 13826 + }, + { + "epoch": 1.5184493740390952, + "grad_norm": 2.3970489501953125, + "learning_rate": 1e-06, + "loss": 0.7995, + "mean_token_accuracy": 0.7477775812149048, + "num_tokens": 349940149.0, + "step": 13827 + }, + { + "epoch": 1.5185591917417087, + "grad_norm": 2.1411757469177246, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.72182297706604, + "num_tokens": 349966757.0, + "step": 13828 + }, + { + "epoch": 1.5186690094443223, + "grad_norm": 2.4998233318328857, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7029308080673218, + "num_tokens": 349988904.0, + "step": 13829 + }, + { + "epoch": 1.518778827146936, + "grad_norm": 2.517096519470215, + "learning_rate": 1e-06, + "loss": 0.7827, + "mean_token_accuracy": 0.7480294704437256, + "num_tokens": 350009049.0, + "step": 13830 + }, + { + "epoch": 1.5188886448495498, + "grad_norm": 2.3142592906951904, + "learning_rate": 1e-06, + "loss": 0.8236, + "mean_token_accuracy": 0.7405604124069214, + "num_tokens": 350031730.0, + "step": 13831 + }, + { + "epoch": 1.5189984625521635, + "grad_norm": 2.164970636367798, + "learning_rate": 1e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7415201663970947, + "num_tokens": 350057586.0, + "step": 13832 + }, + { + "epoch": 1.519108280254777, + "grad_norm": 2.660804271697998, + "learning_rate": 
1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7206112742424011, + "num_tokens": 350078927.0, + "step": 13833 + }, + { + "epoch": 1.5192180979573906, + "grad_norm": 2.361224889755249, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7127618193626404, + "num_tokens": 350104159.0, + "step": 13834 + }, + { + "epoch": 1.5193279156600044, + "grad_norm": 2.328223705291748, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7188366651535034, + "num_tokens": 350128337.0, + "step": 13835 + }, + { + "epoch": 1.5194377333626181, + "grad_norm": 2.2467072010040283, + "learning_rate": 1e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7344161868095398, + "num_tokens": 350151339.0, + "step": 13836 + }, + { + "epoch": 1.5195475510652319, + "grad_norm": 2.3400843143463135, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7213300466537476, + "num_tokens": 350174496.0, + "step": 13837 + }, + { + "epoch": 1.5196573687678454, + "grad_norm": 2.370298385620117, + "learning_rate": 1e-06, + "loss": 0.7993, + "mean_token_accuracy": 0.7417049407958984, + "num_tokens": 350195659.0, + "step": 13838 + }, + { + "epoch": 1.519767186470459, + "grad_norm": 2.297158718109131, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7163234949111938, + "num_tokens": 350222003.0, + "step": 13839 + }, + { + "epoch": 1.5198770041730727, + "grad_norm": 2.3856637477874756, + "learning_rate": 1e-06, + "loss": 0.8834, + "mean_token_accuracy": 0.7201710939407349, + "num_tokens": 350243432.0, + "step": 13840 + }, + { + "epoch": 1.5199868218756865, + "grad_norm": 2.0430808067321777, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7231333255767822, + "num_tokens": 350273193.0, + "step": 13841 + }, + { + "epoch": 1.5200966395783, + "grad_norm": 2.203045129776001, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.6901416182518005, + "num_tokens": 350300095.0, + "step": 13842 + }, + { + 
"epoch": 1.5202064572809135, + "grad_norm": 2.224027156829834, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.700719952583313, + "num_tokens": 350326299.0, + "step": 13843 + }, + { + "epoch": 1.5203162749835273, + "grad_norm": 2.1709532737731934, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7101329565048218, + "num_tokens": 350354345.0, + "step": 13844 + }, + { + "epoch": 1.520426092686141, + "grad_norm": 2.3557567596435547, + "learning_rate": 1e-06, + "loss": 0.8427, + "mean_token_accuracy": 0.7447364926338196, + "num_tokens": 350376509.0, + "step": 13845 + }, + { + "epoch": 1.5205359103887548, + "grad_norm": 2.3492321968078613, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7083126306533813, + "num_tokens": 350403006.0, + "step": 13846 + }, + { + "epoch": 1.5206457280913683, + "grad_norm": 2.228109359741211, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7149083614349365, + "num_tokens": 350428360.0, + "step": 13847 + }, + { + "epoch": 1.5207555457939819, + "grad_norm": 2.0750575065612793, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.7140887975692749, + "num_tokens": 350457979.0, + "step": 13848 + }, + { + "epoch": 1.5208653634965956, + "grad_norm": 2.3243563175201416, + "learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7035930156707764, + "num_tokens": 350484849.0, + "step": 13849 + }, + { + "epoch": 1.5209751811992094, + "grad_norm": 2.372856855392456, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7255865931510925, + "num_tokens": 350507457.0, + "step": 13850 + }, + { + "epoch": 1.5210849989018231, + "grad_norm": 2.020469903945923, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.6949065923690796, + "num_tokens": 350540511.0, + "step": 13851 + }, + { + "epoch": 1.5211948166044367, + "grad_norm": 2.4030842781066895, + "learning_rate": 1e-06, + "loss": 0.906, + 
"mean_token_accuracy": 0.7195253968238831, + "num_tokens": 350564644.0, + "step": 13852 + }, + { + "epoch": 1.5213046343070502, + "grad_norm": 2.3740732669830322, + "learning_rate": 1e-06, + "loss": 0.8242, + "mean_token_accuracy": 0.7300213575363159, + "num_tokens": 350587440.0, + "step": 13853 + }, + { + "epoch": 1.521414452009664, + "grad_norm": 2.4624717235565186, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.6960625648498535, + "num_tokens": 350610579.0, + "step": 13854 + }, + { + "epoch": 1.5215242697122777, + "grad_norm": 2.0560104846954346, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7265959978103638, + "num_tokens": 350641422.0, + "step": 13855 + }, + { + "epoch": 1.5216340874148913, + "grad_norm": 2.729076623916626, + "learning_rate": 1e-06, + "loss": 0.8323, + "mean_token_accuracy": 0.741629958152771, + "num_tokens": 350659407.0, + "step": 13856 + }, + { + "epoch": 1.5217439051175048, + "grad_norm": 2.639437437057495, + "learning_rate": 1e-06, + "loss": 0.8528, + "mean_token_accuracy": 0.7325737476348877, + "num_tokens": 350679247.0, + "step": 13857 + }, + { + "epoch": 1.5218537228201185, + "grad_norm": 2.423781633377075, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7214739322662354, + "num_tokens": 350702246.0, + "step": 13858 + }, + { + "epoch": 1.5219635405227323, + "grad_norm": 2.251068592071533, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7161659598350525, + "num_tokens": 350727758.0, + "step": 13859 + }, + { + "epoch": 1.522073358225346, + "grad_norm": 2.22770357131958, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7058317065238953, + "num_tokens": 350754204.0, + "step": 13860 + }, + { + "epoch": 1.5221831759279596, + "grad_norm": 2.3444926738739014, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7048062682151794, + "num_tokens": 350779173.0, + "step": 13861 + }, + { + "epoch": 1.5222929936305731, 
+ "grad_norm": 2.2818856239318848, + "learning_rate": 1e-06, + "loss": 0.7919, + "mean_token_accuracy": 0.7532130479812622, + "num_tokens": 350800999.0, + "step": 13862 + }, + { + "epoch": 1.522402811333187, + "grad_norm": 2.445770025253296, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7128823399543762, + "num_tokens": 350822604.0, + "step": 13863 + }, + { + "epoch": 1.5225126290358006, + "grad_norm": 1.985457181930542, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7133656144142151, + "num_tokens": 350853450.0, + "step": 13864 + }, + { + "epoch": 1.5226224467384142, + "grad_norm": 1.996132254600525, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7017098665237427, + "num_tokens": 350883950.0, + "step": 13865 + }, + { + "epoch": 1.522732264441028, + "grad_norm": 2.2647528648376465, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7110096216201782, + "num_tokens": 350908278.0, + "step": 13866 + }, + { + "epoch": 1.5228420821436415, + "grad_norm": 2.1442015171051025, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.6979950666427612, + "num_tokens": 350935949.0, + "step": 13867 + }, + { + "epoch": 1.5229518998462552, + "grad_norm": 2.168423652648926, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7101136445999146, + "num_tokens": 350962166.0, + "step": 13868 + }, + { + "epoch": 1.523061717548869, + "grad_norm": 2.3628318309783936, + "learning_rate": 1e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.738118588924408, + "num_tokens": 350983640.0, + "step": 13869 + }, + { + "epoch": 1.5231715352514825, + "grad_norm": 2.5787417888641357, + "learning_rate": 1e-06, + "loss": 0.8355, + "mean_token_accuracy": 0.7339320182800293, + "num_tokens": 351004746.0, + "step": 13870 + }, + { + "epoch": 1.523281352954096, + "grad_norm": 2.5485403537750244, + "learning_rate": 1e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.7308608293533325, + 
"num_tokens": 351026198.0, + "step": 13871 + }, + { + "epoch": 1.5233911706567098, + "grad_norm": 2.3969972133636475, + "learning_rate": 1e-06, + "loss": 0.8653, + "mean_token_accuracy": 0.7301750779151917, + "num_tokens": 351047480.0, + "step": 13872 + }, + { + "epoch": 1.5235009883593236, + "grad_norm": 2.2883176803588867, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7055410146713257, + "num_tokens": 351074382.0, + "step": 13873 + }, + { + "epoch": 1.5236108060619373, + "grad_norm": 2.3623290061950684, + "learning_rate": 1e-06, + "loss": 0.8359, + "mean_token_accuracy": 0.7358956336975098, + "num_tokens": 351096571.0, + "step": 13874 + }, + { + "epoch": 1.5237206237645509, + "grad_norm": 2.3846945762634277, + "learning_rate": 1e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.7283446192741394, + "num_tokens": 351120185.0, + "step": 13875 + }, + { + "epoch": 1.5238304414671644, + "grad_norm": 2.3626644611358643, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7045512795448303, + "num_tokens": 351148666.0, + "step": 13876 + }, + { + "epoch": 1.5239402591697782, + "grad_norm": 2.603074789047241, + "learning_rate": 1e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7250726819038391, + "num_tokens": 351168337.0, + "step": 13877 + }, + { + "epoch": 1.524050076872392, + "grad_norm": 2.2888615131378174, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7101132273674011, + "num_tokens": 351193129.0, + "step": 13878 + }, + { + "epoch": 1.5241598945750054, + "grad_norm": 2.0578014850616455, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.6990047693252563, + "num_tokens": 351223980.0, + "step": 13879 + }, + { + "epoch": 1.5242697122776192, + "grad_norm": 2.297668218612671, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.702326774597168, + "num_tokens": 351247152.0, + "step": 13880 + }, + { + "epoch": 1.5243795299802327, + "grad_norm": 2.287644386291504, + 
"learning_rate": 1e-06, + "loss": 0.7336, + "mean_token_accuracy": 0.7583907246589661, + "num_tokens": 351268538.0, + "step": 13881 + }, + { + "epoch": 1.5244893476828465, + "grad_norm": 2.3366568088531494, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7143310308456421, + "num_tokens": 351292382.0, + "step": 13882 + }, + { + "epoch": 1.5245991653854603, + "grad_norm": 2.4539847373962402, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.704015851020813, + "num_tokens": 351314472.0, + "step": 13883 + }, + { + "epoch": 1.5247089830880738, + "grad_norm": 2.3319811820983887, + "learning_rate": 1e-06, + "loss": 0.8282, + "mean_token_accuracy": 0.7349157929420471, + "num_tokens": 351336233.0, + "step": 13884 + }, + { + "epoch": 1.5248188007906873, + "grad_norm": 2.308925151824951, + "learning_rate": 1e-06, + "loss": 0.8617, + "mean_token_accuracy": 0.7279157638549805, + "num_tokens": 351360886.0, + "step": 13885 + }, + { + "epoch": 1.524928618493301, + "grad_norm": 2.425668716430664, + "learning_rate": 1e-06, + "loss": 0.7753, + "mean_token_accuracy": 0.7502784729003906, + "num_tokens": 351381843.0, + "step": 13886 + }, + { + "epoch": 1.5250384361959148, + "grad_norm": 2.4137229919433594, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7246575355529785, + "num_tokens": 351402876.0, + "step": 13887 + }, + { + "epoch": 1.5251482538985286, + "grad_norm": 2.319298505783081, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7147795557975769, + "num_tokens": 351427189.0, + "step": 13888 + }, + { + "epoch": 1.5252580716011421, + "grad_norm": 2.305866003036499, + "learning_rate": 1e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.7293999195098877, + "num_tokens": 351451369.0, + "step": 13889 + }, + { + "epoch": 1.5253678893037557, + "grad_norm": 2.228973150253296, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7149238586425781, + "num_tokens": 351477731.0, + "step": 
13890 + }, + { + "epoch": 1.5254777070063694, + "grad_norm": 2.353196859359741, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.6982384920120239, + "num_tokens": 351501933.0, + "step": 13891 + }, + { + "epoch": 1.5255875247089832, + "grad_norm": 2.4202396869659424, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7165907621383667, + "num_tokens": 351525087.0, + "step": 13892 + }, + { + "epoch": 1.5256973424115967, + "grad_norm": 2.5080959796905518, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7232615947723389, + "num_tokens": 351546982.0, + "step": 13893 + }, + { + "epoch": 1.5258071601142102, + "grad_norm": 2.2507030963897705, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.699921190738678, + "num_tokens": 351572670.0, + "step": 13894 + }, + { + "epoch": 1.525916977816824, + "grad_norm": 2.1589818000793457, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7118707895278931, + "num_tokens": 351599953.0, + "step": 13895 + }, + { + "epoch": 1.5260267955194378, + "grad_norm": 2.0659258365631104, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7081917524337769, + "num_tokens": 351628337.0, + "step": 13896 + }, + { + "epoch": 1.5261366132220515, + "grad_norm": 2.4469640254974365, + "learning_rate": 1e-06, + "loss": 0.8483, + "mean_token_accuracy": 0.7320649027824402, + "num_tokens": 351649684.0, + "step": 13897 + }, + { + "epoch": 1.526246430924665, + "grad_norm": 2.542616605758667, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7258407473564148, + "num_tokens": 351670412.0, + "step": 13898 + }, + { + "epoch": 1.5263562486272786, + "grad_norm": 2.287121057510376, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7164605855941772, + "num_tokens": 351694132.0, + "step": 13899 + }, + { + "epoch": 1.5264660663298923, + "grad_norm": 2.5079891681671143, + "learning_rate": 1e-06, + "loss": 0.8362, 
+ "mean_token_accuracy": 0.733370304107666, + "num_tokens": 351714734.0, + "step": 13900 + }, + { + "epoch": 1.526575884032506, + "grad_norm": 2.324662446975708, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7123307585716248, + "num_tokens": 351738896.0, + "step": 13901 + }, + { + "epoch": 1.5266857017351199, + "grad_norm": 2.3826286792755127, + "learning_rate": 1e-06, + "loss": 0.8176, + "mean_token_accuracy": 0.7443060874938965, + "num_tokens": 351761394.0, + "step": 13902 + }, + { + "epoch": 1.5267955194377334, + "grad_norm": 2.32625675201416, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7145024538040161, + "num_tokens": 351786067.0, + "step": 13903 + }, + { + "epoch": 1.526905337140347, + "grad_norm": 2.157538890838623, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7220809459686279, + "num_tokens": 351812527.0, + "step": 13904 + }, + { + "epoch": 1.5270151548429607, + "grad_norm": 2.5724904537200928, + "learning_rate": 1e-06, + "loss": 0.937, + "mean_token_accuracy": 0.7163039445877075, + "num_tokens": 351833370.0, + "step": 13905 + }, + { + "epoch": 1.5271249725455744, + "grad_norm": 2.4182474613189697, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7181766033172607, + "num_tokens": 351855977.0, + "step": 13906 + }, + { + "epoch": 1.527234790248188, + "grad_norm": 2.051769971847534, + "learning_rate": 1e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7306597232818604, + "num_tokens": 351885990.0, + "step": 13907 + }, + { + "epoch": 1.5273446079508015, + "grad_norm": 2.4923148155212402, + "learning_rate": 1e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7264370918273926, + "num_tokens": 351908297.0, + "step": 13908 + }, + { + "epoch": 1.5274544256534153, + "grad_norm": 2.3197696208953857, + "learning_rate": 1e-06, + "loss": 0.817, + "mean_token_accuracy": 0.7431463599205017, + "num_tokens": 351931955.0, + "step": 13909 + }, + { + "epoch": 1.527564243356029, 
+ "grad_norm": 2.7052292823791504, + "learning_rate": 1e-06, + "loss": 0.8384, + "mean_token_accuracy": 0.7369773387908936, + "num_tokens": 351950849.0, + "step": 13910 + }, + { + "epoch": 1.5276740610586428, + "grad_norm": 2.420140266418457, + "learning_rate": 1e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.735793948173523, + "num_tokens": 351973027.0, + "step": 13911 + }, + { + "epoch": 1.5277838787612563, + "grad_norm": 2.135047435760498, + "learning_rate": 1e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.7243064045906067, + "num_tokens": 351999992.0, + "step": 13912 + }, + { + "epoch": 1.5278936964638699, + "grad_norm": 2.208000659942627, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7132970690727234, + "num_tokens": 352026546.0, + "step": 13913 + }, + { + "epoch": 1.5280035141664836, + "grad_norm": 2.5670599937438965, + "learning_rate": 1e-06, + "loss": 0.8531, + "mean_token_accuracy": 0.7330994606018066, + "num_tokens": 352047571.0, + "step": 13914 + }, + { + "epoch": 1.5281133318690974, + "grad_norm": 2.1984024047851562, + "learning_rate": 1e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7252722978591919, + "num_tokens": 352073811.0, + "step": 13915 + }, + { + "epoch": 1.5282231495717111, + "grad_norm": 2.3245689868927, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7092298269271851, + "num_tokens": 352099685.0, + "step": 13916 + }, + { + "epoch": 1.5283329672743247, + "grad_norm": 2.079122304916382, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7116767168045044, + "num_tokens": 352128861.0, + "step": 13917 + }, + { + "epoch": 1.5284427849769382, + "grad_norm": 2.3801984786987305, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7277501821517944, + "num_tokens": 352152350.0, + "step": 13918 + }, + { + "epoch": 1.528552602679552, + "grad_norm": 2.5610389709472656, + "learning_rate": 1e-06, + "loss": 0.8502, + "mean_token_accuracy": 0.743998646736145, + 
"num_tokens": 352173523.0, + "step": 13919 + }, + { + "epoch": 1.5286624203821657, + "grad_norm": 2.4489426612854004, + "learning_rate": 1e-06, + "loss": 0.8514, + "mean_token_accuracy": 0.7368287444114685, + "num_tokens": 352195064.0, + "step": 13920 + }, + { + "epoch": 1.5287722380847792, + "grad_norm": 1.822788119316101, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7053593397140503, + "num_tokens": 352232862.0, + "step": 13921 + }, + { + "epoch": 1.5288820557873928, + "grad_norm": 2.476316213607788, + "learning_rate": 1e-06, + "loss": 0.8573, + "mean_token_accuracy": 0.7300641536712646, + "num_tokens": 352254217.0, + "step": 13922 + }, + { + "epoch": 1.5289918734900065, + "grad_norm": 2.3921492099761963, + "learning_rate": 1e-06, + "loss": 0.7675, + "mean_token_accuracy": 0.7528343200683594, + "num_tokens": 352277588.0, + "step": 13923 + }, + { + "epoch": 1.5291016911926203, + "grad_norm": 2.20092511177063, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.701664924621582, + "num_tokens": 352305940.0, + "step": 13924 + }, + { + "epoch": 1.529211508895234, + "grad_norm": 2.620882034301758, + "learning_rate": 1e-06, + "loss": 0.8106, + "mean_token_accuracy": 0.7598093152046204, + "num_tokens": 352324245.0, + "step": 13925 + }, + { + "epoch": 1.5293213265978476, + "grad_norm": 2.1670591831207275, + "learning_rate": 1e-06, + "loss": 0.8011, + "mean_token_accuracy": 0.7495448589324951, + "num_tokens": 352348581.0, + "step": 13926 + }, + { + "epoch": 1.5294311443004611, + "grad_norm": 2.3140876293182373, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7211287021636963, + "num_tokens": 352373054.0, + "step": 13927 + }, + { + "epoch": 1.5295409620030749, + "grad_norm": 2.0610508918762207, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7165291905403137, + "num_tokens": 352402027.0, + "step": 13928 + }, + { + "epoch": 1.5296507797056886, + "grad_norm": 2.378796339035034, + 
"learning_rate": 1e-06, + "loss": 0.8117, + "mean_token_accuracy": 0.7357374429702759, + "num_tokens": 352425835.0, + "step": 13929 + }, + { + "epoch": 1.5297605974083022, + "grad_norm": 2.123335123062134, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7106777429580688, + "num_tokens": 352454303.0, + "step": 13930 + }, + { + "epoch": 1.529870415110916, + "grad_norm": 2.1502304077148438, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7352554798126221, + "num_tokens": 352481272.0, + "step": 13931 + }, + { + "epoch": 1.5299802328135295, + "grad_norm": 2.357231616973877, + "learning_rate": 1e-06, + "loss": 0.8084, + "mean_token_accuracy": 0.7409896850585938, + "num_tokens": 352502905.0, + "step": 13932 + }, + { + "epoch": 1.5300900505161432, + "grad_norm": 2.4503960609436035, + "learning_rate": 1e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.733599066734314, + "num_tokens": 352523277.0, + "step": 13933 + }, + { + "epoch": 1.530199868218757, + "grad_norm": 2.5119986534118652, + "learning_rate": 1e-06, + "loss": 0.8704, + "mean_token_accuracy": 0.7363133430480957, + "num_tokens": 352545824.0, + "step": 13934 + }, + { + "epoch": 1.5303096859213705, + "grad_norm": 2.6480960845947266, + "learning_rate": 1e-06, + "loss": 0.8664, + "mean_token_accuracy": 0.7302830815315247, + "num_tokens": 352567548.0, + "step": 13935 + }, + { + "epoch": 1.530419503623984, + "grad_norm": 2.225362777709961, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.711034893989563, + "num_tokens": 352596392.0, + "step": 13936 + }, + { + "epoch": 1.5305293213265978, + "grad_norm": 2.2683587074279785, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7094817757606506, + "num_tokens": 352620965.0, + "step": 13937 + }, + { + "epoch": 1.5306391390292116, + "grad_norm": 2.2268433570861816, + "learning_rate": 1e-06, + "loss": 0.8215, + "mean_token_accuracy": 0.7409977912902832, + "num_tokens": 352645178.0, + "step": 
13938 + }, + { + "epoch": 1.5307489567318253, + "grad_norm": 2.269017219543457, + "learning_rate": 1e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.7230654358863831, + "num_tokens": 352669773.0, + "step": 13939 + }, + { + "epoch": 1.5308587744344389, + "grad_norm": 1.993714690208435, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7159822583198547, + "num_tokens": 352700846.0, + "step": 13940 + }, + { + "epoch": 1.5309685921370524, + "grad_norm": 2.0746073722839355, + "learning_rate": 1e-06, + "loss": 1.0568, + "mean_token_accuracy": 0.6778067946434021, + "num_tokens": 352735043.0, + "step": 13941 + }, + { + "epoch": 1.5310784098396661, + "grad_norm": 2.514453887939453, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7118794918060303, + "num_tokens": 352756343.0, + "step": 13942 + }, + { + "epoch": 1.53118822754228, + "grad_norm": 2.185666561126709, + "learning_rate": 1e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.7305912971496582, + "num_tokens": 352782665.0, + "step": 13943 + }, + { + "epoch": 1.5312980452448934, + "grad_norm": 2.090757131576538, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7064769268035889, + "num_tokens": 352812169.0, + "step": 13944 + }, + { + "epoch": 1.5314078629475072, + "grad_norm": 2.16902232170105, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7148453593254089, + "num_tokens": 352840871.0, + "step": 13945 + }, + { + "epoch": 1.5315176806501207, + "grad_norm": 2.4263877868652344, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7157262563705444, + "num_tokens": 352862691.0, + "step": 13946 + }, + { + "epoch": 1.5316274983527345, + "grad_norm": 2.0304148197174072, + "learning_rate": 1e-06, + "loss": 0.8602, + "mean_token_accuracy": 0.7301914691925049, + "num_tokens": 352892852.0, + "step": 13947 + }, + { + "epoch": 1.5317373160553482, + "grad_norm": 2.0755014419555664, + "learning_rate": 1e-06, + "loss": 0.8643, + 
"mean_token_accuracy": 0.7281693816184998, + "num_tokens": 352924558.0, + "step": 13948 + }, + { + "epoch": 1.5318471337579618, + "grad_norm": 2.172424554824829, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7002176642417908, + "num_tokens": 352951469.0, + "step": 13949 + }, + { + "epoch": 1.5319569514605753, + "grad_norm": 2.279684066772461, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7173934578895569, + "num_tokens": 352976479.0, + "step": 13950 + }, + { + "epoch": 1.532066769163189, + "grad_norm": 2.3436777591705322, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7192560434341431, + "num_tokens": 353000058.0, + "step": 13951 + }, + { + "epoch": 1.5321765868658028, + "grad_norm": 2.0037758350372314, + "learning_rate": 1e-06, + "loss": 1.0283, + "mean_token_accuracy": 0.6815977096557617, + "num_tokens": 353035076.0, + "step": 13952 + }, + { + "epoch": 1.5322864045684166, + "grad_norm": 2.3253605365753174, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7110670208930969, + "num_tokens": 353062093.0, + "step": 13953 + }, + { + "epoch": 1.5323962222710301, + "grad_norm": 2.302192449569702, + "learning_rate": 1e-06, + "loss": 0.8306, + "mean_token_accuracy": 0.7392947673797607, + "num_tokens": 353087059.0, + "step": 13954 + }, + { + "epoch": 1.5325060399736437, + "grad_norm": 2.5145931243896484, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7088183164596558, + "num_tokens": 353110819.0, + "step": 13955 + }, + { + "epoch": 1.5326158576762574, + "grad_norm": 2.23752760887146, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7169978618621826, + "num_tokens": 353136540.0, + "step": 13956 + }, + { + "epoch": 1.5327256753788712, + "grad_norm": 2.187002182006836, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7128086686134338, + "num_tokens": 353164445.0, + "step": 13957 + }, + { + "epoch": 
1.5328354930814847, + "grad_norm": 2.046747922897339, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7101227045059204, + "num_tokens": 353194250.0, + "step": 13958 + }, + { + "epoch": 1.5329453107840982, + "grad_norm": 2.3234031200408936, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7234033346176147, + "num_tokens": 353217796.0, + "step": 13959 + }, + { + "epoch": 1.533055128486712, + "grad_norm": 2.1204593181610107, + "learning_rate": 1e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.7264019846916199, + "num_tokens": 353245665.0, + "step": 13960 + }, + { + "epoch": 1.5331649461893258, + "grad_norm": 2.2244794368743896, + "learning_rate": 1e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.7250556349754333, + "num_tokens": 353273285.0, + "step": 13961 + }, + { + "epoch": 1.5332747638919395, + "grad_norm": 2.102839231491089, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.6976878643035889, + "num_tokens": 353301094.0, + "step": 13962 + }, + { + "epoch": 1.533384581594553, + "grad_norm": 2.6046881675720215, + "learning_rate": 1e-06, + "loss": 0.8415, + "mean_token_accuracy": 0.733465313911438, + "num_tokens": 353322140.0, + "step": 13963 + }, + { + "epoch": 1.5334943992971666, + "grad_norm": 2.1205849647521973, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.6985675096511841, + "num_tokens": 353353705.0, + "step": 13964 + }, + { + "epoch": 1.5336042169997803, + "grad_norm": 2.584794521331787, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7109390497207642, + "num_tokens": 353373468.0, + "step": 13965 + }, + { + "epoch": 1.533714034702394, + "grad_norm": 2.475451946258545, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.714322566986084, + "num_tokens": 353397164.0, + "step": 13966 + }, + { + "epoch": 1.5338238524050078, + "grad_norm": 2.412618398666382, + "learning_rate": 1e-06, + "loss": 1.0261, + "mean_token_accuracy": 
0.6950341463088989, + "num_tokens": 353422528.0, + "step": 13967 + }, + { + "epoch": 1.5339336701076214, + "grad_norm": 2.5582735538482666, + "learning_rate": 1e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7281102538108826, + "num_tokens": 353443014.0, + "step": 13968 + }, + { + "epoch": 1.534043487810235, + "grad_norm": 2.4024980068206787, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7270967364311218, + "num_tokens": 353463953.0, + "step": 13969 + }, + { + "epoch": 1.5341533055128487, + "grad_norm": 2.0798468589782715, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7016500234603882, + "num_tokens": 353493843.0, + "step": 13970 + }, + { + "epoch": 1.5342631232154624, + "grad_norm": 2.7846078872680664, + "learning_rate": 1e-06, + "loss": 0.871, + "mean_token_accuracy": 0.7262359857559204, + "num_tokens": 353512790.0, + "step": 13971 + }, + { + "epoch": 1.534372940918076, + "grad_norm": 2.113513231277466, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.6967047452926636, + "num_tokens": 353544887.0, + "step": 13972 + }, + { + "epoch": 1.5344827586206895, + "grad_norm": 2.8682262897491455, + "learning_rate": 1e-06, + "loss": 0.7728, + "mean_token_accuracy": 0.750351071357727, + "num_tokens": 353562476.0, + "step": 13973 + }, + { + "epoch": 1.5345925763233033, + "grad_norm": 1.994884967803955, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.703133225440979, + "num_tokens": 353592988.0, + "step": 13974 + }, + { + "epoch": 1.534702394025917, + "grad_norm": 2.5373551845550537, + "learning_rate": 1e-06, + "loss": 0.8643, + "mean_token_accuracy": 0.7327044010162354, + "num_tokens": 353613102.0, + "step": 13975 + }, + { + "epoch": 1.5348122117285308, + "grad_norm": 2.3537228107452393, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7236766219139099, + "num_tokens": 353635739.0, + "step": 13976 + }, + { + "epoch": 1.5349220294311443, + "grad_norm": 
1.9749425649642944, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.6926893591880798, + "num_tokens": 353667401.0, + "step": 13977 + }, + { + "epoch": 1.5350318471337578, + "grad_norm": 2.2969961166381836, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7306962609291077, + "num_tokens": 353691266.0, + "step": 13978 + }, + { + "epoch": 1.5351416648363716, + "grad_norm": 2.3475427627563477, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7116016149520874, + "num_tokens": 353717116.0, + "step": 13979 + }, + { + "epoch": 1.5352514825389854, + "grad_norm": 2.430629253387451, + "learning_rate": 1e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.7246809005737305, + "num_tokens": 353740067.0, + "step": 13980 + }, + { + "epoch": 1.535361300241599, + "grad_norm": 2.735276222229004, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7275149822235107, + "num_tokens": 353759991.0, + "step": 13981 + }, + { + "epoch": 1.5354711179442126, + "grad_norm": 2.2578513622283936, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7210952043533325, + "num_tokens": 353785228.0, + "step": 13982 + }, + { + "epoch": 1.5355809356468262, + "grad_norm": 2.3804681301116943, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7137235403060913, + "num_tokens": 353810648.0, + "step": 13983 + }, + { + "epoch": 1.53569075334944, + "grad_norm": 2.155242919921875, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7080085873603821, + "num_tokens": 353839255.0, + "step": 13984 + }, + { + "epoch": 1.5358005710520537, + "grad_norm": 2.429753065109253, + "learning_rate": 1e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.7256154417991638, + "num_tokens": 353860637.0, + "step": 13985 + }, + { + "epoch": 1.5359103887546672, + "grad_norm": 2.3477556705474854, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.702828586101532, + "num_tokens": 
353885582.0, + "step": 13986 + }, + { + "epoch": 1.5360202064572808, + "grad_norm": 2.246594190597534, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7104074358940125, + "num_tokens": 353910534.0, + "step": 13987 + }, + { + "epoch": 1.5361300241598945, + "grad_norm": 2.4516077041625977, + "learning_rate": 1e-06, + "loss": 0.8531, + "mean_token_accuracy": 0.7351038455963135, + "num_tokens": 353932322.0, + "step": 13988 + }, + { + "epoch": 1.5362398418625083, + "grad_norm": 2.0148444175720215, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7277742624282837, + "num_tokens": 353961280.0, + "step": 13989 + }, + { + "epoch": 1.536349659565122, + "grad_norm": 2.2909998893737793, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7079849243164062, + "num_tokens": 353986601.0, + "step": 13990 + }, + { + "epoch": 1.5364594772677356, + "grad_norm": 2.173722982406616, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7157449126243591, + "num_tokens": 354013577.0, + "step": 13991 + }, + { + "epoch": 1.536569294970349, + "grad_norm": 2.2553000450134277, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.6971660852432251, + "num_tokens": 354039236.0, + "step": 13992 + }, + { + "epoch": 1.5366791126729629, + "grad_norm": 2.4492974281311035, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7115381360054016, + "num_tokens": 354062304.0, + "step": 13993 + }, + { + "epoch": 1.5367889303755766, + "grad_norm": 2.454704999923706, + "learning_rate": 1e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.7293276786804199, + "num_tokens": 354084179.0, + "step": 13994 + }, + { + "epoch": 1.5368987480781902, + "grad_norm": 2.226067066192627, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7047991156578064, + "num_tokens": 354110413.0, + "step": 13995 + }, + { + "epoch": 1.537008565780804, + "grad_norm": 2.2853353023529053, + "learning_rate": 
1e-06, + "loss": 0.8297, + "mean_token_accuracy": 0.7374280691146851, + "num_tokens": 354132994.0, + "step": 13996 + }, + { + "epoch": 1.5371183834834174, + "grad_norm": 2.3657991886138916, + "learning_rate": 1e-06, + "loss": 0.8024, + "mean_token_accuracy": 0.7528572082519531, + "num_tokens": 354154799.0, + "step": 13997 + }, + { + "epoch": 1.5372282011860312, + "grad_norm": 1.9326754808425903, + "learning_rate": 1e-06, + "loss": 0.8077, + "mean_token_accuracy": 0.7436044812202454, + "num_tokens": 354187123.0, + "step": 13998 + }, + { + "epoch": 1.537338018888645, + "grad_norm": 2.2494821548461914, + "learning_rate": 1e-06, + "loss": 0.8615, + "mean_token_accuracy": 0.7251487374305725, + "num_tokens": 354212254.0, + "step": 13999 + }, + { + "epoch": 1.5374478365912585, + "grad_norm": 2.264225721359253, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7046378254890442, + "num_tokens": 354235649.0, + "step": 14000 + }, + { + "epoch": 1.537557654293872, + "grad_norm": 2.6437840461730957, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7198315262794495, + "num_tokens": 354255319.0, + "step": 14001 + }, + { + "epoch": 1.5376674719964858, + "grad_norm": 1.9702773094177246, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7152648568153381, + "num_tokens": 354285825.0, + "step": 14002 + }, + { + "epoch": 1.5377772896990995, + "grad_norm": 2.2581427097320557, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7211787700653076, + "num_tokens": 354311758.0, + "step": 14003 + }, + { + "epoch": 1.5378871074017133, + "grad_norm": 2.5555591583251953, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7097156047821045, + "num_tokens": 354333144.0, + "step": 14004 + }, + { + "epoch": 1.5379969251043268, + "grad_norm": 2.3034627437591553, + "learning_rate": 1e-06, + "loss": 0.8572, + "mean_token_accuracy": 0.7312946915626526, + "num_tokens": 354357928.0, + "step": 14005 + }, + 
{ + "epoch": 1.5381067428069404, + "grad_norm": 2.209449529647827, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7020261287689209, + "num_tokens": 354385913.0, + "step": 14006 + }, + { + "epoch": 1.5382165605095541, + "grad_norm": 2.10951566696167, + "learning_rate": 1e-06, + "loss": 0.7964, + "mean_token_accuracy": 0.747524619102478, + "num_tokens": 354411427.0, + "step": 14007 + }, + { + "epoch": 1.5383263782121679, + "grad_norm": 2.4062893390655518, + "learning_rate": 1e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7271457314491272, + "num_tokens": 354433748.0, + "step": 14008 + }, + { + "epoch": 1.5384361959147814, + "grad_norm": 2.139522075653076, + "learning_rate": 1e-06, + "loss": 0.865, + "mean_token_accuracy": 0.7380608320236206, + "num_tokens": 354462006.0, + "step": 14009 + }, + { + "epoch": 1.538546013617395, + "grad_norm": 1.9572969675064087, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.6879087090492249, + "num_tokens": 354494958.0, + "step": 14010 + }, + { + "epoch": 1.5386558313200087, + "grad_norm": 2.228442668914795, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7304168939590454, + "num_tokens": 354519545.0, + "step": 14011 + }, + { + "epoch": 1.5387656490226225, + "grad_norm": 2.4794678688049316, + "learning_rate": 1e-06, + "loss": 0.8327, + "mean_token_accuracy": 0.7465824484825134, + "num_tokens": 354540042.0, + "step": 14012 + }, + { + "epoch": 1.5388754667252362, + "grad_norm": 2.3064627647399902, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7053462266921997, + "num_tokens": 354564212.0, + "step": 14013 + }, + { + "epoch": 1.5389852844278498, + "grad_norm": 2.081345319747925, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7136028409004211, + "num_tokens": 354593891.0, + "step": 14014 + }, + { + "epoch": 1.5390951021304633, + "grad_norm": 2.540618896484375, + "learning_rate": 1e-06, + "loss": 0.7948, + 
"mean_token_accuracy": 0.7527308464050293, + "num_tokens": 354615864.0, + "step": 14015 + }, + { + "epoch": 1.539204919833077, + "grad_norm": 2.5197999477386475, + "learning_rate": 1e-06, + "loss": 0.8316, + "mean_token_accuracy": 0.7462597489356995, + "num_tokens": 354637738.0, + "step": 14016 + }, + { + "epoch": 1.5393147375356908, + "grad_norm": 2.334071636199951, + "learning_rate": 1e-06, + "loss": 0.896, + "mean_token_accuracy": 0.7246438264846802, + "num_tokens": 354661036.0, + "step": 14017 + }, + { + "epoch": 1.5394245552383046, + "grad_norm": 2.203963041305542, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7269294261932373, + "num_tokens": 354687132.0, + "step": 14018 + }, + { + "epoch": 1.539534372940918, + "grad_norm": 2.5930721759796143, + "learning_rate": 1e-06, + "loss": 0.7966, + "mean_token_accuracy": 0.7416161298751831, + "num_tokens": 354708094.0, + "step": 14019 + }, + { + "epoch": 1.5396441906435316, + "grad_norm": 2.210188388824463, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7189106941223145, + "num_tokens": 354735149.0, + "step": 14020 + }, + { + "epoch": 1.5397540083461454, + "grad_norm": 2.4497814178466797, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.6957426071166992, + "num_tokens": 354758203.0, + "step": 14021 + }, + { + "epoch": 1.5398638260487592, + "grad_norm": 2.1375792026519775, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7133506536483765, + "num_tokens": 354784426.0, + "step": 14022 + }, + { + "epoch": 1.5399736437513727, + "grad_norm": 2.174743175506592, + "learning_rate": 1e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7215062975883484, + "num_tokens": 354812965.0, + "step": 14023 + }, + { + "epoch": 1.5400834614539862, + "grad_norm": 2.706238269805908, + "learning_rate": 1e-06, + "loss": 0.8198, + "mean_token_accuracy": 0.7428078055381775, + "num_tokens": 354831492.0, + "step": 14024 + }, + { + "epoch": 1.5401932791566, + 
"grad_norm": 2.3633639812469482, + "learning_rate": 1e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.7319305539131165, + "num_tokens": 354853561.0, + "step": 14025 + }, + { + "epoch": 1.5403030968592137, + "grad_norm": 1.8582361936569214, + "learning_rate": 1e-06, + "loss": 1.0475, + "mean_token_accuracy": 0.6850124597549438, + "num_tokens": 354889102.0, + "step": 14026 + }, + { + "epoch": 1.5404129145618275, + "grad_norm": 2.0672783851623535, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7150237560272217, + "num_tokens": 354919887.0, + "step": 14027 + }, + { + "epoch": 1.540522732264441, + "grad_norm": 2.427499532699585, + "learning_rate": 1e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.7224698066711426, + "num_tokens": 354941289.0, + "step": 14028 + }, + { + "epoch": 1.5406325499670546, + "grad_norm": 1.895593523979187, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7170423865318298, + "num_tokens": 354975275.0, + "step": 14029 + }, + { + "epoch": 1.5407423676696683, + "grad_norm": 2.3398447036743164, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7015448808670044, + "num_tokens": 354998462.0, + "step": 14030 + }, + { + "epoch": 1.540852185372282, + "grad_norm": 2.0337157249450684, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7074173092842102, + "num_tokens": 355028522.0, + "step": 14031 + }, + { + "epoch": 1.5409620030748958, + "grad_norm": 2.3465065956115723, + "learning_rate": 1e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7302106022834778, + "num_tokens": 355052372.0, + "step": 14032 + }, + { + "epoch": 1.5410718207775094, + "grad_norm": 2.387042760848999, + "learning_rate": 1e-06, + "loss": 0.8323, + "mean_token_accuracy": 0.7306874990463257, + "num_tokens": 355074921.0, + "step": 14033 + }, + { + "epoch": 1.541181638480123, + "grad_norm": 2.469886064529419, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7139816284179688, + 
"num_tokens": 355097185.0, + "step": 14034 + }, + { + "epoch": 1.5412914561827367, + "grad_norm": 2.3370041847229004, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7062572240829468, + "num_tokens": 355122439.0, + "step": 14035 + }, + { + "epoch": 1.5414012738853504, + "grad_norm": 2.3052220344543457, + "learning_rate": 1e-06, + "loss": 0.8082, + "mean_token_accuracy": 0.7440319061279297, + "num_tokens": 355147007.0, + "step": 14036 + }, + { + "epoch": 1.541511091587964, + "grad_norm": 1.9821178913116455, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7039510011672974, + "num_tokens": 355179683.0, + "step": 14037 + }, + { + "epoch": 1.5416209092905775, + "grad_norm": 2.692103385925293, + "learning_rate": 1e-06, + "loss": 0.7918, + "mean_token_accuracy": 0.7438027858734131, + "num_tokens": 355198426.0, + "step": 14038 + }, + { + "epoch": 1.5417307269931912, + "grad_norm": 2.5062062740325928, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7065459489822388, + "num_tokens": 355219921.0, + "step": 14039 + }, + { + "epoch": 1.541840544695805, + "grad_norm": 2.805227041244507, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7188811302185059, + "num_tokens": 355238423.0, + "step": 14040 + }, + { + "epoch": 1.5419503623984188, + "grad_norm": 2.1134328842163086, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7267787456512451, + "num_tokens": 355268015.0, + "step": 14041 + }, + { + "epoch": 1.5420601801010323, + "grad_norm": 2.624917507171631, + "learning_rate": 1e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.7144424915313721, + "num_tokens": 355290135.0, + "step": 14042 + }, + { + "epoch": 1.5421699978036458, + "grad_norm": 2.3393499851226807, + "learning_rate": 1e-06, + "loss": 0.8549, + "mean_token_accuracy": 0.7323911786079407, + "num_tokens": 355313140.0, + "step": 14043 + }, + { + "epoch": 1.5422798155062596, + "grad_norm": 2.079184055328369, + 
"learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7048740386962891, + "num_tokens": 355342814.0, + "step": 14044 + }, + { + "epoch": 1.5423896332088733, + "grad_norm": 2.5144922733306885, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7084119319915771, + "num_tokens": 355364429.0, + "step": 14045 + }, + { + "epoch": 1.5424994509114869, + "grad_norm": 2.307401657104492, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.7003554701805115, + "num_tokens": 355389870.0, + "step": 14046 + }, + { + "epoch": 1.5426092686141006, + "grad_norm": 2.100025177001953, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7140005826950073, + "num_tokens": 355417577.0, + "step": 14047 + }, + { + "epoch": 1.5427190863167142, + "grad_norm": 2.2505269050598145, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7159221172332764, + "num_tokens": 355443530.0, + "step": 14048 + }, + { + "epoch": 1.542828904019328, + "grad_norm": 2.262690544128418, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7056757211685181, + "num_tokens": 355470114.0, + "step": 14049 + }, + { + "epoch": 1.5429387217219417, + "grad_norm": 2.415383815765381, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.731735348701477, + "num_tokens": 355494396.0, + "step": 14050 + }, + { + "epoch": 1.5430485394245552, + "grad_norm": 2.0211141109466553, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.6840059757232666, + "num_tokens": 355527617.0, + "step": 14051 + }, + { + "epoch": 1.5431583571271688, + "grad_norm": 2.266317844390869, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7039656639099121, + "num_tokens": 355553815.0, + "step": 14052 + }, + { + "epoch": 1.5432681748297825, + "grad_norm": 2.216914176940918, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.6991243362426758, + "num_tokens": 355582568.0, + "step": 
14053 + }, + { + "epoch": 1.5433779925323963, + "grad_norm": 2.221116781234741, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7057433128356934, + "num_tokens": 355608172.0, + "step": 14054 + }, + { + "epoch": 1.54348781023501, + "grad_norm": 2.6094143390655518, + "learning_rate": 1e-06, + "loss": 0.8506, + "mean_token_accuracy": 0.7421762943267822, + "num_tokens": 355628834.0, + "step": 14055 + }, + { + "epoch": 1.5435976279376236, + "grad_norm": 2.2308194637298584, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7058367729187012, + "num_tokens": 355654810.0, + "step": 14056 + }, + { + "epoch": 1.543707445640237, + "grad_norm": 2.234163761138916, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7142422795295715, + "num_tokens": 355683442.0, + "step": 14057 + }, + { + "epoch": 1.5438172633428509, + "grad_norm": 1.8891280889511108, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7022889256477356, + "num_tokens": 355717356.0, + "step": 14058 + }, + { + "epoch": 1.5439270810454646, + "grad_norm": 2.6013638973236084, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7112838625907898, + "num_tokens": 355738835.0, + "step": 14059 + }, + { + "epoch": 1.5440368987480781, + "grad_norm": 2.3190431594848633, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7081554532051086, + "num_tokens": 355763123.0, + "step": 14060 + }, + { + "epoch": 1.544146716450692, + "grad_norm": 2.173672676086426, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.707780122756958, + "num_tokens": 355789442.0, + "step": 14061 + }, + { + "epoch": 1.5442565341533054, + "grad_norm": 2.4305288791656494, + "learning_rate": 1e-06, + "loss": 0.7266, + "mean_token_accuracy": 0.7655208110809326, + "num_tokens": 355809680.0, + "step": 14062 + }, + { + "epoch": 1.5443663518559192, + "grad_norm": 2.316290855407715, + "learning_rate": 1e-06, + "loss": 0.824, + 
"mean_token_accuracy": 0.7422904372215271, + "num_tokens": 355833363.0, + "step": 14063 + }, + { + "epoch": 1.544476169558533, + "grad_norm": 2.229994297027588, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7180692553520203, + "num_tokens": 355860255.0, + "step": 14064 + }, + { + "epoch": 1.5445859872611465, + "grad_norm": 2.294654130935669, + "learning_rate": 1e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.7420884966850281, + "num_tokens": 355883320.0, + "step": 14065 + }, + { + "epoch": 1.54469580496376, + "grad_norm": 2.504920244216919, + "learning_rate": 1e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.7240738272666931, + "num_tokens": 355903582.0, + "step": 14066 + }, + { + "epoch": 1.5448056226663738, + "grad_norm": 2.1941139698028564, + "learning_rate": 1e-06, + "loss": 0.7521, + "mean_token_accuracy": 0.7566555738449097, + "num_tokens": 355927091.0, + "step": 14067 + }, + { + "epoch": 1.5449154403689875, + "grad_norm": 2.3268725872039795, + "learning_rate": 1e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.7271789908409119, + "num_tokens": 355951480.0, + "step": 14068 + }, + { + "epoch": 1.5450252580716013, + "grad_norm": 2.3726930618286133, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7022374868392944, + "num_tokens": 355975097.0, + "step": 14069 + }, + { + "epoch": 1.5451350757742148, + "grad_norm": 2.0314745903015137, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.6931079626083374, + "num_tokens": 356005834.0, + "step": 14070 + }, + { + "epoch": 1.5452448934768284, + "grad_norm": 2.457124710083008, + "learning_rate": 1e-06, + "loss": 0.8466, + "mean_token_accuracy": 0.7286795377731323, + "num_tokens": 356027241.0, + "step": 14071 + }, + { + "epoch": 1.5453547111794421, + "grad_norm": 1.9646453857421875, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7119946479797363, + "num_tokens": 356060434.0, + "step": 14072 + }, + { + "epoch": 
1.5454645288820559, + "grad_norm": 2.232567071914673, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.6997013092041016, + "num_tokens": 356089019.0, + "step": 14073 + }, + { + "epoch": 1.5455743465846694, + "grad_norm": 2.3913683891296387, + "learning_rate": 1e-06, + "loss": 0.849, + "mean_token_accuracy": 0.730984628200531, + "num_tokens": 356110035.0, + "step": 14074 + }, + { + "epoch": 1.545684164287283, + "grad_norm": 2.5840373039245605, + "learning_rate": 1e-06, + "loss": 0.8444, + "mean_token_accuracy": 0.7337368130683899, + "num_tokens": 356131030.0, + "step": 14075 + }, + { + "epoch": 1.5457939819898967, + "grad_norm": 2.136117458343506, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.71292644739151, + "num_tokens": 356157998.0, + "step": 14076 + }, + { + "epoch": 1.5459037996925105, + "grad_norm": 2.4155404567718506, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.7069968581199646, + "num_tokens": 356182210.0, + "step": 14077 + }, + { + "epoch": 1.5460136173951242, + "grad_norm": 2.3847780227661133, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7098040580749512, + "num_tokens": 356205235.0, + "step": 14078 + }, + { + "epoch": 1.5461234350977378, + "grad_norm": 2.1524970531463623, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.6947700381278992, + "num_tokens": 356236187.0, + "step": 14079 + }, + { + "epoch": 1.5462332528003513, + "grad_norm": 2.455521583557129, + "learning_rate": 1e-06, + "loss": 0.8696, + "mean_token_accuracy": 0.734046459197998, + "num_tokens": 356258741.0, + "step": 14080 + }, + { + "epoch": 1.546343070502965, + "grad_norm": 2.287492513656616, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7250835299491882, + "num_tokens": 356285115.0, + "step": 14081 + }, + { + "epoch": 1.5464528882055788, + "grad_norm": 2.5016493797302246, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 
0.7147669196128845, + "num_tokens": 356307150.0, + "step": 14082 + }, + { + "epoch": 1.5465627059081926, + "grad_norm": 2.2050745487213135, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7052633166313171, + "num_tokens": 356335870.0, + "step": 14083 + }, + { + "epoch": 1.546672523610806, + "grad_norm": 2.483402967453003, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7174594402313232, + "num_tokens": 356358646.0, + "step": 14084 + }, + { + "epoch": 1.5467823413134196, + "grad_norm": 2.0813071727752686, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.720374584197998, + "num_tokens": 356385940.0, + "step": 14085 + }, + { + "epoch": 1.5468921590160334, + "grad_norm": 2.2412896156311035, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7145655751228333, + "num_tokens": 356410523.0, + "step": 14086 + }, + { + "epoch": 1.5470019767186471, + "grad_norm": 2.2168819904327393, + "learning_rate": 1e-06, + "loss": 0.8571, + "mean_token_accuracy": 0.7257827520370483, + "num_tokens": 356436777.0, + "step": 14087 + }, + { + "epoch": 1.5471117944212607, + "grad_norm": 2.4936583042144775, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7217676639556885, + "num_tokens": 356459476.0, + "step": 14088 + }, + { + "epoch": 1.5472216121238742, + "grad_norm": 2.7655158042907715, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.728786826133728, + "num_tokens": 356478773.0, + "step": 14089 + }, + { + "epoch": 1.547331429826488, + "grad_norm": 2.490816354751587, + "learning_rate": 1e-06, + "loss": 0.8516, + "mean_token_accuracy": 0.7354053258895874, + "num_tokens": 356499816.0, + "step": 14090 + }, + { + "epoch": 1.5474412475291017, + "grad_norm": 2.234139919281006, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7120713591575623, + "num_tokens": 356526098.0, + "step": 14091 + }, + { + "epoch": 1.5475510652317155, + "grad_norm": 
2.552117347717285, + "learning_rate": 1e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.7291861772537231, + "num_tokens": 356546668.0, + "step": 14092 + }, + { + "epoch": 1.547660882934329, + "grad_norm": 2.4357409477233887, + "learning_rate": 1e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.7327771186828613, + "num_tokens": 356568984.0, + "step": 14093 + }, + { + "epoch": 1.5477707006369426, + "grad_norm": 2.386960744857788, + "learning_rate": 1e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7251068353652954, + "num_tokens": 356591292.0, + "step": 14094 + }, + { + "epoch": 1.5478805183395563, + "grad_norm": 2.303255319595337, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7258549928665161, + "num_tokens": 356614531.0, + "step": 14095 + }, + { + "epoch": 1.54799033604217, + "grad_norm": 2.3135933876037598, + "learning_rate": 1e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7256319522857666, + "num_tokens": 356640325.0, + "step": 14096 + }, + { + "epoch": 1.5481001537447838, + "grad_norm": 2.0770599842071533, + "learning_rate": 1e-06, + "loss": 0.8587, + "mean_token_accuracy": 0.7309252619743347, + "num_tokens": 356668062.0, + "step": 14097 + }, + { + "epoch": 1.5482099714473974, + "grad_norm": 2.4660863876342773, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.715138852596283, + "num_tokens": 356690522.0, + "step": 14098 + }, + { + "epoch": 1.548319789150011, + "grad_norm": 2.122157096862793, + "learning_rate": 1e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7234141230583191, + "num_tokens": 356718956.0, + "step": 14099 + }, + { + "epoch": 1.5484296068526247, + "grad_norm": 1.989141583442688, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.704663872718811, + "num_tokens": 356751228.0, + "step": 14100 + }, + { + "epoch": 1.5485394245552384, + "grad_norm": 2.3844714164733887, + "learning_rate": 1e-06, + "loss": 0.8736, + "mean_token_accuracy": 0.7261164784431458, + "num_tokens": 
356774019.0, + "step": 14101 + }, + { + "epoch": 1.548649242257852, + "grad_norm": 2.155735492706299, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.697636067867279, + "num_tokens": 356802708.0, + "step": 14102 + }, + { + "epoch": 1.5487590599604655, + "grad_norm": 2.2053844928741455, + "learning_rate": 1e-06, + "loss": 0.8326, + "mean_token_accuracy": 0.739345908164978, + "num_tokens": 356828043.0, + "step": 14103 + }, + { + "epoch": 1.5488688776630792, + "grad_norm": 2.148430109024048, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7174173593521118, + "num_tokens": 356855227.0, + "step": 14104 + }, + { + "epoch": 1.548978695365693, + "grad_norm": 2.035013437271118, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.6994473338127136, + "num_tokens": 356885908.0, + "step": 14105 + }, + { + "epoch": 1.5490885130683067, + "grad_norm": 2.1752400398254395, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.6972997188568115, + "num_tokens": 356913677.0, + "step": 14106 + }, + { + "epoch": 1.5491983307709203, + "grad_norm": 2.359706401824951, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7222259640693665, + "num_tokens": 356936420.0, + "step": 14107 + }, + { + "epoch": 1.5493081484735338, + "grad_norm": 2.4508118629455566, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7193880677223206, + "num_tokens": 356959707.0, + "step": 14108 + }, + { + "epoch": 1.5494179661761476, + "grad_norm": 2.5076775550842285, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7176181077957153, + "num_tokens": 356980026.0, + "step": 14109 + }, + { + "epoch": 1.5495277838787613, + "grad_norm": 2.360351085662842, + "learning_rate": 1e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7226521968841553, + "num_tokens": 357006748.0, + "step": 14110 + }, + { + "epoch": 1.5496376015813749, + "grad_norm": 2.6540098190307617, + "learning_rate": 1e-06, 
+ "loss": 0.8811, + "mean_token_accuracy": 0.7276917695999146, + "num_tokens": 357027176.0, + "step": 14111 + }, + { + "epoch": 1.5497474192839886, + "grad_norm": 2.2099368572235107, + "learning_rate": 1e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.6936597228050232, + "num_tokens": 357055453.0, + "step": 14112 + }, + { + "epoch": 1.5498572369866022, + "grad_norm": 1.9963146448135376, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7242516279220581, + "num_tokens": 357088193.0, + "step": 14113 + }, + { + "epoch": 1.549967054689216, + "grad_norm": 2.5175282955169678, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7001436948776245, + "num_tokens": 357109763.0, + "step": 14114 + }, + { + "epoch": 1.5500768723918297, + "grad_norm": 2.408357620239258, + "learning_rate": 1e-06, + "loss": 0.8427, + "mean_token_accuracy": 0.730199933052063, + "num_tokens": 357131488.0, + "step": 14115 + }, + { + "epoch": 1.5501866900944432, + "grad_norm": 2.2498769760131836, + "learning_rate": 1e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7232682108879089, + "num_tokens": 357157194.0, + "step": 14116 + }, + { + "epoch": 1.5502965077970567, + "grad_norm": 2.7096219062805176, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7245826125144958, + "num_tokens": 357176815.0, + "step": 14117 + }, + { + "epoch": 1.5504063254996705, + "grad_norm": 2.2335424423217773, + "learning_rate": 1e-06, + "loss": 0.7947, + "mean_token_accuracy": 0.7443848848342896, + "num_tokens": 357200560.0, + "step": 14118 + }, + { + "epoch": 1.5505161432022843, + "grad_norm": 2.4623334407806396, + "learning_rate": 1e-06, + "loss": 0.8238, + "mean_token_accuracy": 0.735855221748352, + "num_tokens": 357220540.0, + "step": 14119 + }, + { + "epoch": 1.550625960904898, + "grad_norm": 2.2828683853149414, + "learning_rate": 1e-06, + "loss": 0.8368, + "mean_token_accuracy": 0.7359917163848877, + "num_tokens": 357245099.0, + "step": 14120 + }, + { + 
"epoch": 1.5507357786075116, + "grad_norm": 2.4657299518585205, + "learning_rate": 1e-06, + "loss": 0.8511, + "mean_token_accuracy": 0.7260040044784546, + "num_tokens": 357267030.0, + "step": 14121 + }, + { + "epoch": 1.550845596310125, + "grad_norm": 2.011467456817627, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7231131792068481, + "num_tokens": 357298301.0, + "step": 14122 + }, + { + "epoch": 1.5509554140127388, + "grad_norm": 2.2032887935638428, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7087051272392273, + "num_tokens": 357325698.0, + "step": 14123 + }, + { + "epoch": 1.5510652317153526, + "grad_norm": 2.3637142181396484, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7208191156387329, + "num_tokens": 357350385.0, + "step": 14124 + }, + { + "epoch": 1.5511750494179661, + "grad_norm": 2.117692470550537, + "learning_rate": 1e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.7283979654312134, + "num_tokens": 357377853.0, + "step": 14125 + }, + { + "epoch": 1.55128486712058, + "grad_norm": 2.22352933883667, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7068367004394531, + "num_tokens": 357405046.0, + "step": 14126 + }, + { + "epoch": 1.5513946848231934, + "grad_norm": 2.4047317504882812, + "learning_rate": 1e-06, + "loss": 0.8342, + "mean_token_accuracy": 0.7494902610778809, + "num_tokens": 357426482.0, + "step": 14127 + }, + { + "epoch": 1.5515045025258072, + "grad_norm": 2.2284436225891113, + "learning_rate": 1e-06, + "loss": 0.8357, + "mean_token_accuracy": 0.7407841682434082, + "num_tokens": 357450980.0, + "step": 14128 + }, + { + "epoch": 1.551614320228421, + "grad_norm": 2.298135757446289, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7140692472457886, + "num_tokens": 357475884.0, + "step": 14129 + }, + { + "epoch": 1.5517241379310345, + "grad_norm": 2.393448829650879, + "learning_rate": 1e-06, + "loss": 0.796, + 
"mean_token_accuracy": 0.7399245500564575, + "num_tokens": 357499070.0, + "step": 14130 + }, + { + "epoch": 1.551833955633648, + "grad_norm": 2.091261625289917, + "learning_rate": 1e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7241470813751221, + "num_tokens": 357526082.0, + "step": 14131 + }, + { + "epoch": 1.5519437733362618, + "grad_norm": 2.3198235034942627, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7216883897781372, + "num_tokens": 357553288.0, + "step": 14132 + }, + { + "epoch": 1.5520535910388755, + "grad_norm": 2.3015904426574707, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7300236225128174, + "num_tokens": 357577890.0, + "step": 14133 + }, + { + "epoch": 1.5521634087414893, + "grad_norm": 2.7295260429382324, + "learning_rate": 1e-06, + "loss": 0.8364, + "mean_token_accuracy": 0.7434942722320557, + "num_tokens": 357597232.0, + "step": 14134 + }, + { + "epoch": 1.5522732264441028, + "grad_norm": 2.0251927375793457, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7296333312988281, + "num_tokens": 357628367.0, + "step": 14135 + }, + { + "epoch": 1.5523830441467164, + "grad_norm": 2.201443910598755, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7185091972351074, + "num_tokens": 357654575.0, + "step": 14136 + }, + { + "epoch": 1.55249286184933, + "grad_norm": 1.951087474822998, + "learning_rate": 1e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7407940030097961, + "num_tokens": 357684977.0, + "step": 14137 + }, + { + "epoch": 1.5526026795519439, + "grad_norm": 2.372924327850342, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.6973558664321899, + "num_tokens": 357709128.0, + "step": 14138 + }, + { + "epoch": 1.5527124972545574, + "grad_norm": 2.319761037826538, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.709391176700592, + "num_tokens": 357733649.0, + "step": 14139 + }, + { + "epoch": 1.552822314957171, + 
"grad_norm": 2.4314868450164795, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7251805663108826, + "num_tokens": 357758027.0, + "step": 14140 + }, + { + "epoch": 1.5529321326597847, + "grad_norm": 2.3174893856048584, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7152163982391357, + "num_tokens": 357781307.0, + "step": 14141 + }, + { + "epoch": 1.5530419503623984, + "grad_norm": 2.0610876083374023, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7060843706130981, + "num_tokens": 357813947.0, + "step": 14142 + }, + { + "epoch": 1.5531517680650122, + "grad_norm": 2.400158405303955, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7210798263549805, + "num_tokens": 357836327.0, + "step": 14143 + }, + { + "epoch": 1.5532615857676257, + "grad_norm": 2.561154365539551, + "learning_rate": 1e-06, + "loss": 0.8664, + "mean_token_accuracy": 0.7215498685836792, + "num_tokens": 357856717.0, + "step": 14144 + }, + { + "epoch": 1.5533714034702393, + "grad_norm": 2.108867645263672, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7103108167648315, + "num_tokens": 357883272.0, + "step": 14145 + }, + { + "epoch": 1.553481221172853, + "grad_norm": 2.159670352935791, + "learning_rate": 1e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7314443588256836, + "num_tokens": 357909343.0, + "step": 14146 + }, + { + "epoch": 1.5535910388754668, + "grad_norm": 2.3169126510620117, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7107141017913818, + "num_tokens": 357934439.0, + "step": 14147 + }, + { + "epoch": 1.5537008565780805, + "grad_norm": 2.448063373565674, + "learning_rate": 1e-06, + "loss": 0.8452, + "mean_token_accuracy": 0.738593339920044, + "num_tokens": 357957835.0, + "step": 14148 + }, + { + "epoch": 1.553810674280694, + "grad_norm": 2.324892997741699, + "learning_rate": 1e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7260953187942505, + 
"num_tokens": 357980807.0, + "step": 14149 + }, + { + "epoch": 1.5539204919833076, + "grad_norm": 2.2920267581939697, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7036823630332947, + "num_tokens": 358003930.0, + "step": 14150 + }, + { + "epoch": 1.5540303096859214, + "grad_norm": 2.4951725006103516, + "learning_rate": 1e-06, + "loss": 0.8247, + "mean_token_accuracy": 0.7472292184829712, + "num_tokens": 358025331.0, + "step": 14151 + }, + { + "epoch": 1.5541401273885351, + "grad_norm": 2.2749667167663574, + "learning_rate": 1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7219845056533813, + "num_tokens": 358050990.0, + "step": 14152 + }, + { + "epoch": 1.5542499450911487, + "grad_norm": 2.070875406265259, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.720291018486023, + "num_tokens": 358079719.0, + "step": 14153 + }, + { + "epoch": 1.5543597627937622, + "grad_norm": 2.4186370372772217, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7218480110168457, + "num_tokens": 358102589.0, + "step": 14154 + }, + { + "epoch": 1.554469580496376, + "grad_norm": 2.1957921981811523, + "learning_rate": 1e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7323648929595947, + "num_tokens": 358129874.0, + "step": 14155 + }, + { + "epoch": 1.5545793981989897, + "grad_norm": 2.236344337463379, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.7012902498245239, + "num_tokens": 358161449.0, + "step": 14156 + }, + { + "epoch": 1.5546892159016035, + "grad_norm": 2.245938301086426, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7292630076408386, + "num_tokens": 358189763.0, + "step": 14157 + }, + { + "epoch": 1.554799033604217, + "grad_norm": 2.0746145248413086, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7085403800010681, + "num_tokens": 358220276.0, + "step": 14158 + }, + { + "epoch": 1.5549088513068305, + "grad_norm": 2.0142593383789062, + 
"learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7066237926483154, + "num_tokens": 358250178.0, + "step": 14159 + }, + { + "epoch": 1.5550186690094443, + "grad_norm": 2.1717946529388428, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7004506587982178, + "num_tokens": 358280210.0, + "step": 14160 + }, + { + "epoch": 1.555128486712058, + "grad_norm": 2.251732587814331, + "learning_rate": 1e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7191612720489502, + "num_tokens": 358308080.0, + "step": 14161 + }, + { + "epoch": 1.5552383044146716, + "grad_norm": 2.4812514781951904, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.6966450810432434, + "num_tokens": 358331626.0, + "step": 14162 + }, + { + "epoch": 1.5553481221172853, + "grad_norm": 2.292508363723755, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.6992694139480591, + "num_tokens": 358354771.0, + "step": 14163 + }, + { + "epoch": 1.5554579398198989, + "grad_norm": 2.5422072410583496, + "learning_rate": 1e-06, + "loss": 0.8384, + "mean_token_accuracy": 0.7334972620010376, + "num_tokens": 358375136.0, + "step": 14164 + }, + { + "epoch": 1.5555677575225126, + "grad_norm": 2.2509925365448, + "learning_rate": 1e-06, + "loss": 0.8897, + "mean_token_accuracy": 0.7233139276504517, + "num_tokens": 358399724.0, + "step": 14165 + }, + { + "epoch": 1.5556775752251264, + "grad_norm": 2.2743136882781982, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7058137655258179, + "num_tokens": 358424298.0, + "step": 14166 + }, + { + "epoch": 1.55578739292774, + "grad_norm": 2.1066300868988037, + "learning_rate": 1e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.7153524160385132, + "num_tokens": 358453452.0, + "step": 14167 + }, + { + "epoch": 1.5558972106303535, + "grad_norm": 2.207066297531128, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7047515511512756, + "num_tokens": 358479969.0, + "step": 14168 
+ }, + { + "epoch": 1.5560070283329672, + "grad_norm": 2.2640132904052734, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7216642498970032, + "num_tokens": 358504466.0, + "step": 14169 + }, + { + "epoch": 1.556116846035581, + "grad_norm": 2.162496328353882, + "learning_rate": 1e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7177481651306152, + "num_tokens": 358532493.0, + "step": 14170 + }, + { + "epoch": 1.5562266637381947, + "grad_norm": 2.234224319458008, + "learning_rate": 1e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7162603139877319, + "num_tokens": 358557982.0, + "step": 14171 + }, + { + "epoch": 1.5563364814408083, + "grad_norm": 2.5553197860717773, + "learning_rate": 1e-06, + "loss": 0.8102, + "mean_token_accuracy": 0.7429301738739014, + "num_tokens": 358578229.0, + "step": 14172 + }, + { + "epoch": 1.5564462991434218, + "grad_norm": 2.3640120029449463, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7128095626831055, + "num_tokens": 358602628.0, + "step": 14173 + }, + { + "epoch": 1.5565561168460356, + "grad_norm": 2.4807183742523193, + "learning_rate": 1e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7250657081604004, + "num_tokens": 358624892.0, + "step": 14174 + }, + { + "epoch": 1.5566659345486493, + "grad_norm": 2.1958839893341064, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7052348852157593, + "num_tokens": 358653083.0, + "step": 14175 + }, + { + "epoch": 1.5567757522512629, + "grad_norm": 2.785686731338501, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7095479965209961, + "num_tokens": 358672274.0, + "step": 14176 + }, + { + "epoch": 1.5568855699538766, + "grad_norm": 2.562962293624878, + "learning_rate": 1e-06, + "loss": 0.8502, + "mean_token_accuracy": 0.7345387935638428, + "num_tokens": 358693578.0, + "step": 14177 + }, + { + "epoch": 1.5569953876564901, + "grad_norm": 2.5238986015319824, + "learning_rate": 1e-06, + "loss": 0.8823, + 
"mean_token_accuracy": 0.7223920226097107, + "num_tokens": 358713485.0, + "step": 14178 + }, + { + "epoch": 1.557105205359104, + "grad_norm": 2.2755260467529297, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7241541147232056, + "num_tokens": 358738265.0, + "step": 14179 + }, + { + "epoch": 1.5572150230617177, + "grad_norm": 2.108332633972168, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7067044973373413, + "num_tokens": 358764429.0, + "step": 14180 + }, + { + "epoch": 1.5573248407643312, + "grad_norm": 2.2621171474456787, + "learning_rate": 1e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7343363761901855, + "num_tokens": 358787819.0, + "step": 14181 + }, + { + "epoch": 1.5574346584669447, + "grad_norm": 2.227858304977417, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7274845838546753, + "num_tokens": 358812729.0, + "step": 14182 + }, + { + "epoch": 1.5575444761695585, + "grad_norm": 2.3358261585235596, + "learning_rate": 1e-06, + "loss": 0.7705, + "mean_token_accuracy": 0.7537369728088379, + "num_tokens": 358833594.0, + "step": 14183 + }, + { + "epoch": 1.5576542938721722, + "grad_norm": 2.477630853652954, + "learning_rate": 1e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7210705280303955, + "num_tokens": 358855848.0, + "step": 14184 + }, + { + "epoch": 1.557764111574786, + "grad_norm": 2.319822311401367, + "learning_rate": 1e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7269504070281982, + "num_tokens": 358881496.0, + "step": 14185 + }, + { + "epoch": 1.5578739292773995, + "grad_norm": 2.291012763977051, + "learning_rate": 1e-06, + "loss": 0.8621, + "mean_token_accuracy": 0.7219717502593994, + "num_tokens": 358905333.0, + "step": 14186 + }, + { + "epoch": 1.557983746980013, + "grad_norm": 2.3869881629943848, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.7056070566177368, + "num_tokens": 358929950.0, + "step": 14187 + }, + { + "epoch": 1.5580935646826268, 
+ "grad_norm": 2.198422908782959, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.7295367121696472, + "num_tokens": 358959813.0, + "step": 14188 + }, + { + "epoch": 1.5582033823852406, + "grad_norm": 1.9432523250579834, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7053903341293335, + "num_tokens": 358991909.0, + "step": 14189 + }, + { + "epoch": 1.5583132000878541, + "grad_norm": 2.3313307762145996, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.6982986927032471, + "num_tokens": 359018300.0, + "step": 14190 + }, + { + "epoch": 1.5584230177904677, + "grad_norm": 2.3030989170074463, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7186093926429749, + "num_tokens": 359044951.0, + "step": 14191 + }, + { + "epoch": 1.5585328354930814, + "grad_norm": 2.212794542312622, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7134208679199219, + "num_tokens": 359074066.0, + "step": 14192 + }, + { + "epoch": 1.5586426531956952, + "grad_norm": 2.3540148735046387, + "learning_rate": 1e-06, + "loss": 0.8224, + "mean_token_accuracy": 0.7377309799194336, + "num_tokens": 359096414.0, + "step": 14193 + }, + { + "epoch": 1.558752470898309, + "grad_norm": 2.101330041885376, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7371546626091003, + "num_tokens": 359121807.0, + "step": 14194 + }, + { + "epoch": 1.5588622886009225, + "grad_norm": 2.4010677337646484, + "learning_rate": 1e-06, + "loss": 0.7614, + "mean_token_accuracy": 0.7530192136764526, + "num_tokens": 359145094.0, + "step": 14195 + }, + { + "epoch": 1.558972106303536, + "grad_norm": 2.7812390327453613, + "learning_rate": 1e-06, + "loss": 0.8492, + "mean_token_accuracy": 0.7317364811897278, + "num_tokens": 359164372.0, + "step": 14196 + }, + { + "epoch": 1.5590819240061498, + "grad_norm": 2.4102749824523926, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.7114275097846985, 
+ "num_tokens": 359189905.0, + "step": 14197 + }, + { + "epoch": 1.5591917417087635, + "grad_norm": 2.1911160945892334, + "learning_rate": 1e-06, + "loss": 0.8364, + "mean_token_accuracy": 0.7348282337188721, + "num_tokens": 359216672.0, + "step": 14198 + }, + { + "epoch": 1.5593015594113773, + "grad_norm": 2.175240993499756, + "learning_rate": 1e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.73171466588974, + "num_tokens": 359241782.0, + "step": 14199 + }, + { + "epoch": 1.5594113771139908, + "grad_norm": 2.0366973876953125, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7196325063705444, + "num_tokens": 359270589.0, + "step": 14200 + }, + { + "epoch": 1.5595211948166043, + "grad_norm": 2.234872341156006, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.6886312365531921, + "num_tokens": 359299072.0, + "step": 14201 + }, + { + "epoch": 1.559631012519218, + "grad_norm": 2.130439281463623, + "learning_rate": 1e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.7216100096702576, + "num_tokens": 359325957.0, + "step": 14202 + }, + { + "epoch": 1.5597408302218319, + "grad_norm": 2.5055735111236572, + "learning_rate": 1e-06, + "loss": 0.8172, + "mean_token_accuracy": 0.7499128580093384, + "num_tokens": 359347463.0, + "step": 14203 + }, + { + "epoch": 1.5598506479244454, + "grad_norm": 2.3758277893066406, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7119414806365967, + "num_tokens": 359372427.0, + "step": 14204 + }, + { + "epoch": 1.559960465627059, + "grad_norm": 2.149768114089966, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7247999906539917, + "num_tokens": 359399360.0, + "step": 14205 + }, + { + "epoch": 1.5600702833296727, + "grad_norm": 2.3508338928222656, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.6995904445648193, + "num_tokens": 359424430.0, + "step": 14206 + }, + { + "epoch": 1.5601801010322864, + "grad_norm": 2.315361738204956, + 
"learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7093709707260132, + "num_tokens": 359448215.0, + "step": 14207 + }, + { + "epoch": 1.5602899187349002, + "grad_norm": 2.276259183883667, + "learning_rate": 1e-06, + "loss": 0.8402, + "mean_token_accuracy": 0.7361575961112976, + "num_tokens": 359472657.0, + "step": 14208 + }, + { + "epoch": 1.5603997364375137, + "grad_norm": 2.0245413780212402, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7050784826278687, + "num_tokens": 359503868.0, + "step": 14209 + }, + { + "epoch": 1.5605095541401273, + "grad_norm": 2.2214436531066895, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7215408086776733, + "num_tokens": 359527987.0, + "step": 14210 + }, + { + "epoch": 1.560619371842741, + "grad_norm": 2.562272071838379, + "learning_rate": 1e-06, + "loss": 0.8171, + "mean_token_accuracy": 0.732702910900116, + "num_tokens": 359546526.0, + "step": 14211 + }, + { + "epoch": 1.5607291895453548, + "grad_norm": 2.1762261390686035, + "learning_rate": 1e-06, + "loss": 0.8454, + "mean_token_accuracy": 0.731055498123169, + "num_tokens": 359573539.0, + "step": 14212 + }, + { + "epoch": 1.5608390072479685, + "grad_norm": 2.210008144378662, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7273120284080505, + "num_tokens": 359600398.0, + "step": 14213 + }, + { + "epoch": 1.560948824950582, + "grad_norm": 2.1199135780334473, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7196792364120483, + "num_tokens": 359627075.0, + "step": 14214 + }, + { + "epoch": 1.5610586426531956, + "grad_norm": 2.1940512657165527, + "learning_rate": 1e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7174827456474304, + "num_tokens": 359654669.0, + "step": 14215 + }, + { + "epoch": 1.5611684603558094, + "grad_norm": 2.14034366607666, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.7050836086273193, + "num_tokens": 359684637.0, + "step": 
14216 + }, + { + "epoch": 1.5612782780584231, + "grad_norm": 2.3966641426086426, + "learning_rate": 1e-06, + "loss": 0.8832, + "mean_token_accuracy": 0.7183963060379028, + "num_tokens": 359707166.0, + "step": 14217 + }, + { + "epoch": 1.5613880957610367, + "grad_norm": 1.9752044677734375, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7265263795852661, + "num_tokens": 359739132.0, + "step": 14218 + }, + { + "epoch": 1.5614979134636502, + "grad_norm": 2.4978978633880615, + "learning_rate": 1e-06, + "loss": 0.8512, + "mean_token_accuracy": 0.7350824475288391, + "num_tokens": 359761034.0, + "step": 14219 + }, + { + "epoch": 1.561607731166264, + "grad_norm": 2.653524160385132, + "learning_rate": 1e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.7250897288322449, + "num_tokens": 359780801.0, + "step": 14220 + }, + { + "epoch": 1.5617175488688777, + "grad_norm": 2.583404779434204, + "learning_rate": 1e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.7313392162322998, + "num_tokens": 359802435.0, + "step": 14221 + }, + { + "epoch": 1.5618273665714915, + "grad_norm": 2.386324405670166, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7216662168502808, + "num_tokens": 359827645.0, + "step": 14222 + }, + { + "epoch": 1.561937184274105, + "grad_norm": 2.409235715866089, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.710904598236084, + "num_tokens": 359852429.0, + "step": 14223 + }, + { + "epoch": 1.5620470019767185, + "grad_norm": 2.1813881397247314, + "learning_rate": 1e-06, + "loss": 0.8293, + "mean_token_accuracy": 0.7327967286109924, + "num_tokens": 359879163.0, + "step": 14224 + }, + { + "epoch": 1.5621568196793323, + "grad_norm": 2.508958578109741, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7119442820549011, + "num_tokens": 359899362.0, + "step": 14225 + }, + { + "epoch": 1.562266637381946, + "grad_norm": 2.2953989505767822, + "learning_rate": 1e-06, + "loss": 0.9439, + 
"mean_token_accuracy": 0.7055981159210205, + "num_tokens": 359925455.0, + "step": 14226 + }, + { + "epoch": 1.5623764550845596, + "grad_norm": 2.275470495223999, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7243654727935791, + "num_tokens": 359952223.0, + "step": 14227 + }, + { + "epoch": 1.5624862727871733, + "grad_norm": 2.5651111602783203, + "learning_rate": 1e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.7238823771476746, + "num_tokens": 359972046.0, + "step": 14228 + }, + { + "epoch": 1.5625960904897869, + "grad_norm": 2.307133197784424, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7207468152046204, + "num_tokens": 359998099.0, + "step": 14229 + }, + { + "epoch": 1.5627059081924006, + "grad_norm": 2.196153163909912, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.6953243017196655, + "num_tokens": 360025789.0, + "step": 14230 + }, + { + "epoch": 1.5628157258950144, + "grad_norm": 2.2998385429382324, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7223907113075256, + "num_tokens": 360051212.0, + "step": 14231 + }, + { + "epoch": 1.562925543597628, + "grad_norm": 2.350464105606079, + "learning_rate": 1e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7292325496673584, + "num_tokens": 360074244.0, + "step": 14232 + }, + { + "epoch": 1.5630353613002415, + "grad_norm": 2.2629122734069824, + "learning_rate": 1e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7225120067596436, + "num_tokens": 360098315.0, + "step": 14233 + }, + { + "epoch": 1.5631451790028552, + "grad_norm": 2.1983673572540283, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7033531665802002, + "num_tokens": 360124391.0, + "step": 14234 + }, + { + "epoch": 1.563254996705469, + "grad_norm": 2.560926914215088, + "learning_rate": 1e-06, + "loss": 0.8119, + "mean_token_accuracy": 0.7518337965011597, + "num_tokens": 360143521.0, + "step": 14235 + }, + { + "epoch": 
1.5633648144080827, + "grad_norm": 2.047126054763794, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7181365489959717, + "num_tokens": 360173909.0, + "step": 14236 + }, + { + "epoch": 1.5634746321106963, + "grad_norm": 2.198737382888794, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7125909924507141, + "num_tokens": 360197693.0, + "step": 14237 + }, + { + "epoch": 1.5635844498133098, + "grad_norm": 2.141049385070801, + "learning_rate": 1e-06, + "loss": 0.8699, + "mean_token_accuracy": 0.7243970632553101, + "num_tokens": 360226140.0, + "step": 14238 + }, + { + "epoch": 1.5636942675159236, + "grad_norm": 2.310882091522217, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7011007070541382, + "num_tokens": 360253230.0, + "step": 14239 + }, + { + "epoch": 1.5638040852185373, + "grad_norm": 2.4339869022369385, + "learning_rate": 1e-06, + "loss": 0.8291, + "mean_token_accuracy": 0.7351305484771729, + "num_tokens": 360273960.0, + "step": 14240 + }, + { + "epoch": 1.5639139029211508, + "grad_norm": 2.1302037239074707, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7257934808731079, + "num_tokens": 360301693.0, + "step": 14241 + }, + { + "epoch": 1.5640237206237646, + "grad_norm": 2.062225103378296, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7022994160652161, + "num_tokens": 360332665.0, + "step": 14242 + }, + { + "epoch": 1.5641335383263781, + "grad_norm": 2.535236358642578, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7266558408737183, + "num_tokens": 360354679.0, + "step": 14243 + }, + { + "epoch": 1.564243356028992, + "grad_norm": 2.0338425636291504, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.6998779773712158, + "num_tokens": 360384993.0, + "step": 14244 + }, + { + "epoch": 1.5643531737316057, + "grad_norm": 2.0367231369018555, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 
0.7113859057426453, + "num_tokens": 360415351.0, + "step": 14245 + }, + { + "epoch": 1.5644629914342192, + "grad_norm": 2.0735247135162354, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7198611497879028, + "num_tokens": 360443803.0, + "step": 14246 + }, + { + "epoch": 1.5645728091368327, + "grad_norm": 2.1956913471221924, + "learning_rate": 1e-06, + "loss": 0.7977, + "mean_token_accuracy": 0.7432190775871277, + "num_tokens": 360468970.0, + "step": 14247 + }, + { + "epoch": 1.5646826268394465, + "grad_norm": 2.388209819793701, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7155375480651855, + "num_tokens": 360494064.0, + "step": 14248 + }, + { + "epoch": 1.5647924445420602, + "grad_norm": 2.2290444374084473, + "learning_rate": 1e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.7287461161613464, + "num_tokens": 360519567.0, + "step": 14249 + }, + { + "epoch": 1.564902262244674, + "grad_norm": 2.1262083053588867, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.716259777545929, + "num_tokens": 360545818.0, + "step": 14250 + }, + { + "epoch": 1.5650120799472875, + "grad_norm": 2.5563817024230957, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7283143401145935, + "num_tokens": 360566179.0, + "step": 14251 + }, + { + "epoch": 1.565121897649901, + "grad_norm": 2.3990538120269775, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7074006795883179, + "num_tokens": 360589501.0, + "step": 14252 + }, + { + "epoch": 1.5652317153525148, + "grad_norm": 2.1747725009918213, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7211391925811768, + "num_tokens": 360617316.0, + "step": 14253 + }, + { + "epoch": 1.5653415330551286, + "grad_norm": 2.1483378410339355, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7111719846725464, + "num_tokens": 360645160.0, + "step": 14254 + }, + { + "epoch": 1.565451350757742, + "grad_norm": 
2.77778697013855, + "learning_rate": 1e-06, + "loss": 0.7958, + "mean_token_accuracy": 0.7454025745391846, + "num_tokens": 360662672.0, + "step": 14255 + }, + { + "epoch": 1.5655611684603556, + "grad_norm": 2.6187989711761475, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7007275819778442, + "num_tokens": 360684261.0, + "step": 14256 + }, + { + "epoch": 1.5656709861629694, + "grad_norm": 2.0052969455718994, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7183007001876831, + "num_tokens": 360718650.0, + "step": 14257 + }, + { + "epoch": 1.5657808038655832, + "grad_norm": 2.510136127471924, + "learning_rate": 1e-06, + "loss": 0.8107, + "mean_token_accuracy": 0.7438482046127319, + "num_tokens": 360740895.0, + "step": 14258 + }, + { + "epoch": 1.565890621568197, + "grad_norm": 2.1141459941864014, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7142020463943481, + "num_tokens": 360772452.0, + "step": 14259 + }, + { + "epoch": 1.5660004392708105, + "grad_norm": 2.24979305267334, + "learning_rate": 1e-06, + "loss": 0.83, + "mean_token_accuracy": 0.7364763617515564, + "num_tokens": 360794769.0, + "step": 14260 + }, + { + "epoch": 1.566110256973424, + "grad_norm": 2.1353516578674316, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7108205556869507, + "num_tokens": 360824502.0, + "step": 14261 + }, + { + "epoch": 1.5662200746760377, + "grad_norm": 2.094759702682495, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7045065760612488, + "num_tokens": 360853992.0, + "step": 14262 + }, + { + "epoch": 1.5663298923786515, + "grad_norm": 2.3243188858032227, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7090165019035339, + "num_tokens": 360878266.0, + "step": 14263 + }, + { + "epoch": 1.5664397100812653, + "grad_norm": 2.334745407104492, + "learning_rate": 1e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.7248897552490234, + "num_tokens": 
360902127.0, + "step": 14264 + }, + { + "epoch": 1.5665495277838788, + "grad_norm": 2.1890275478363037, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7121219635009766, + "num_tokens": 360930308.0, + "step": 14265 + }, + { + "epoch": 1.5666593454864923, + "grad_norm": 2.156965494155884, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.70977783203125, + "num_tokens": 360958511.0, + "step": 14266 + }, + { + "epoch": 1.566769163189106, + "grad_norm": 1.91392982006073, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.6994854211807251, + "num_tokens": 360992710.0, + "step": 14267 + }, + { + "epoch": 1.5668789808917198, + "grad_norm": 2.2257399559020996, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7220073938369751, + "num_tokens": 361018577.0, + "step": 14268 + }, + { + "epoch": 1.5669887985943334, + "grad_norm": 2.2552616596221924, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7077285051345825, + "num_tokens": 361043311.0, + "step": 14269 + }, + { + "epoch": 1.567098616296947, + "grad_norm": 2.3383705615997314, + "learning_rate": 1e-06, + "loss": 0.8501, + "mean_token_accuracy": 0.7293286323547363, + "num_tokens": 361068422.0, + "step": 14270 + }, + { + "epoch": 1.5672084339995607, + "grad_norm": 2.058720350265503, + "learning_rate": 1e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.7240212559700012, + "num_tokens": 361099877.0, + "step": 14271 + }, + { + "epoch": 1.5673182517021744, + "grad_norm": 2.198295831680298, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7148503065109253, + "num_tokens": 361124415.0, + "step": 14272 + }, + { + "epoch": 1.5674280694047882, + "grad_norm": 2.302812337875366, + "learning_rate": 1e-06, + "loss": 0.872, + "mean_token_accuracy": 0.7282634973526001, + "num_tokens": 361149713.0, + "step": 14273 + }, + { + "epoch": 1.5675378871074017, + "grad_norm": 2.2766060829162598, + "learning_rate": 1e-06, 
+ "loss": 0.9121, + "mean_token_accuracy": 0.715840220451355, + "num_tokens": 361174600.0, + "step": 14274 + }, + { + "epoch": 1.5676477048100153, + "grad_norm": 2.3816397190093994, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7160506248474121, + "num_tokens": 361199941.0, + "step": 14275 + }, + { + "epoch": 1.567757522512629, + "grad_norm": 2.443647623062134, + "learning_rate": 1e-06, + "loss": 0.8169, + "mean_token_accuracy": 0.7388218641281128, + "num_tokens": 361221724.0, + "step": 14276 + }, + { + "epoch": 1.5678673402152428, + "grad_norm": 2.2517762184143066, + "learning_rate": 1e-06, + "loss": 0.8376, + "mean_token_accuracy": 0.7341443300247192, + "num_tokens": 361246997.0, + "step": 14277 + }, + { + "epoch": 1.5679771579178565, + "grad_norm": 2.0268537998199463, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.712597131729126, + "num_tokens": 361277527.0, + "step": 14278 + }, + { + "epoch": 1.56808697562047, + "grad_norm": 2.3875880241394043, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7256283164024353, + "num_tokens": 361301835.0, + "step": 14279 + }, + { + "epoch": 1.5681967933230836, + "grad_norm": 2.1529500484466553, + "learning_rate": 1e-06, + "loss": 1.0439, + "mean_token_accuracy": 0.6872002482414246, + "num_tokens": 361332385.0, + "step": 14280 + }, + { + "epoch": 1.5683066110256974, + "grad_norm": 2.1542792320251465, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7240134477615356, + "num_tokens": 361359803.0, + "step": 14281 + }, + { + "epoch": 1.568416428728311, + "grad_norm": 2.3482158184051514, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7142751216888428, + "num_tokens": 361385469.0, + "step": 14282 + }, + { + "epoch": 1.5685262464309246, + "grad_norm": 1.9176067113876343, + "learning_rate": 1e-06, + "loss": 0.8644, + "mean_token_accuracy": 0.7221209406852722, + "num_tokens": 361416693.0, + "step": 14283 + }, + { + 
"epoch": 1.5686360641335382, + "grad_norm": 2.114027976989746, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7120870351791382, + "num_tokens": 361443381.0, + "step": 14284 + }, + { + "epoch": 1.568745881836152, + "grad_norm": 2.405233383178711, + "learning_rate": 1e-06, + "loss": 0.8565, + "mean_token_accuracy": 0.7362170219421387, + "num_tokens": 361465423.0, + "step": 14285 + }, + { + "epoch": 1.5688556995387657, + "grad_norm": 2.268033266067505, + "learning_rate": 1e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.7247017621994019, + "num_tokens": 361488314.0, + "step": 14286 + }, + { + "epoch": 1.5689655172413794, + "grad_norm": 2.2861406803131104, + "learning_rate": 1e-06, + "loss": 0.7785, + "mean_token_accuracy": 0.7525866031646729, + "num_tokens": 361510945.0, + "step": 14287 + }, + { + "epoch": 1.569075334943993, + "grad_norm": 2.224256753921509, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7278487682342529, + "num_tokens": 361535342.0, + "step": 14288 + }, + { + "epoch": 1.5691851526466065, + "grad_norm": 2.100069284439087, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7110661268234253, + "num_tokens": 361563156.0, + "step": 14289 + }, + { + "epoch": 1.5692949703492203, + "grad_norm": 2.211080551147461, + "learning_rate": 1e-06, + "loss": 0.8458, + "mean_token_accuracy": 0.7310173511505127, + "num_tokens": 361587707.0, + "step": 14290 + }, + { + "epoch": 1.569404788051834, + "grad_norm": 1.9858369827270508, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.719529390335083, + "num_tokens": 361618878.0, + "step": 14291 + }, + { + "epoch": 1.5695146057544476, + "grad_norm": 2.196821451187134, + "learning_rate": 1e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.7344364523887634, + "num_tokens": 361645085.0, + "step": 14292 + }, + { + "epoch": 1.5696244234570613, + "grad_norm": 2.390576124191284, + "learning_rate": 1e-06, + "loss": 0.8165, + "mean_token_accuracy": 
0.742199182510376, + "num_tokens": 361668812.0, + "step": 14293 + }, + { + "epoch": 1.5697342411596749, + "grad_norm": 2.0956592559814453, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7301000356674194, + "num_tokens": 361697613.0, + "step": 14294 + }, + { + "epoch": 1.5698440588622886, + "grad_norm": 2.4360954761505127, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7151147723197937, + "num_tokens": 361719255.0, + "step": 14295 + }, + { + "epoch": 1.5699538765649024, + "grad_norm": 2.3135058879852295, + "learning_rate": 1e-06, + "loss": 0.8156, + "mean_token_accuracy": 0.7453163266181946, + "num_tokens": 361742237.0, + "step": 14296 + }, + { + "epoch": 1.570063694267516, + "grad_norm": 2.066281795501709, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7003089785575867, + "num_tokens": 361773417.0, + "step": 14297 + }, + { + "epoch": 1.5701735119701294, + "grad_norm": 2.3167386054992676, + "learning_rate": 1e-06, + "loss": 0.824, + "mean_token_accuracy": 0.7378451824188232, + "num_tokens": 361796184.0, + "step": 14298 + }, + { + "epoch": 1.5702833296727432, + "grad_norm": 2.117680072784424, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7082542181015015, + "num_tokens": 361825136.0, + "step": 14299 + }, + { + "epoch": 1.570393147375357, + "grad_norm": 2.2200865745544434, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7052841186523438, + "num_tokens": 361854026.0, + "step": 14300 + }, + { + "epoch": 1.5705029650779707, + "grad_norm": 2.210066556930542, + "learning_rate": 1e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.6938633918762207, + "num_tokens": 361881255.0, + "step": 14301 + }, + { + "epoch": 1.5706127827805842, + "grad_norm": 2.4657018184661865, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7121350765228271, + "num_tokens": 361904217.0, + "step": 14302 + }, + { + "epoch": 1.5707226004831978, + "grad_norm": 
2.3765649795532227, + "learning_rate": 1e-06, + "loss": 0.7984, + "mean_token_accuracy": 0.7421635389328003, + "num_tokens": 361926366.0, + "step": 14303 + }, + { + "epoch": 1.5708324181858115, + "grad_norm": 2.0290627479553223, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7094360589981079, + "num_tokens": 361956390.0, + "step": 14304 + }, + { + "epoch": 1.5709422358884253, + "grad_norm": 2.278616428375244, + "learning_rate": 1e-06, + "loss": 0.8066, + "mean_token_accuracy": 0.7419722080230713, + "num_tokens": 361979293.0, + "step": 14305 + }, + { + "epoch": 1.5710520535910388, + "grad_norm": 2.1141300201416016, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7056962847709656, + "num_tokens": 362008915.0, + "step": 14306 + }, + { + "epoch": 1.5711618712936526, + "grad_norm": 2.2201755046844482, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7006880044937134, + "num_tokens": 362034765.0, + "step": 14307 + }, + { + "epoch": 1.5712716889962661, + "grad_norm": 2.137575149536133, + "learning_rate": 1e-06, + "loss": 0.8415, + "mean_token_accuracy": 0.7445878982543945, + "num_tokens": 362059207.0, + "step": 14308 + }, + { + "epoch": 1.5713815066988799, + "grad_norm": 2.306584596633911, + "learning_rate": 1e-06, + "loss": 0.806, + "mean_token_accuracy": 0.7410476207733154, + "num_tokens": 362082016.0, + "step": 14309 + }, + { + "epoch": 1.5714913244014936, + "grad_norm": 2.164952039718628, + "learning_rate": 1e-06, + "loss": 0.8561, + "mean_token_accuracy": 0.7342942953109741, + "num_tokens": 362109242.0, + "step": 14310 + }, + { + "epoch": 1.5716011421041072, + "grad_norm": 2.0337917804718018, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7096490263938904, + "num_tokens": 362138962.0, + "step": 14311 + }, + { + "epoch": 1.5717109598067207, + "grad_norm": 2.1546947956085205, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7149738073348999, + 
"num_tokens": 362165877.0, + "step": 14312 + }, + { + "epoch": 1.5718207775093345, + "grad_norm": 2.3704817295074463, + "learning_rate": 1e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7201540470123291, + "num_tokens": 362188291.0, + "step": 14313 + }, + { + "epoch": 1.5719305952119482, + "grad_norm": 2.6370797157287598, + "learning_rate": 1e-06, + "loss": 0.7675, + "mean_token_accuracy": 0.7501692771911621, + "num_tokens": 362207239.0, + "step": 14314 + }, + { + "epoch": 1.572040412914562, + "grad_norm": 2.3049545288085938, + "learning_rate": 1e-06, + "loss": 0.8144, + "mean_token_accuracy": 0.7456008791923523, + "num_tokens": 362230774.0, + "step": 14315 + }, + { + "epoch": 1.5721502306171755, + "grad_norm": 2.090212821960449, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.716880202293396, + "num_tokens": 362261476.0, + "step": 14316 + }, + { + "epoch": 1.572260048319789, + "grad_norm": 2.2805895805358887, + "learning_rate": 1e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.6949401497840881, + "num_tokens": 362287337.0, + "step": 14317 + }, + { + "epoch": 1.5723698660224028, + "grad_norm": 2.237940788269043, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.70630943775177, + "num_tokens": 362312531.0, + "step": 14318 + }, + { + "epoch": 1.5724796837250166, + "grad_norm": 2.1503405570983887, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.714514434337616, + "num_tokens": 362341242.0, + "step": 14319 + }, + { + "epoch": 1.57258950142763, + "grad_norm": 2.2377493381500244, + "learning_rate": 1e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.7221236228942871, + "num_tokens": 362367357.0, + "step": 14320 + }, + { + "epoch": 1.5726993191302436, + "grad_norm": 2.0590720176696777, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7015782594680786, + "num_tokens": 362398361.0, + "step": 14321 + }, + { + "epoch": 1.5728091368328574, + "grad_norm": 2.498509645462036, + 
"learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7200419306755066, + "num_tokens": 362419541.0, + "step": 14322 + }, + { + "epoch": 1.5729189545354711, + "grad_norm": 2.0920751094818115, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7158266305923462, + "num_tokens": 362448283.0, + "step": 14323 + }, + { + "epoch": 1.573028772238085, + "grad_norm": 2.186049699783325, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7156530618667603, + "num_tokens": 362475018.0, + "step": 14324 + }, + { + "epoch": 1.5731385899406984, + "grad_norm": 2.5040152072906494, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7186346650123596, + "num_tokens": 362496601.0, + "step": 14325 + }, + { + "epoch": 1.573248407643312, + "grad_norm": 2.1664857864379883, + "learning_rate": 1e-06, + "loss": 0.8937, + "mean_token_accuracy": 0.7216075658798218, + "num_tokens": 362522185.0, + "step": 14326 + }, + { + "epoch": 1.5733582253459257, + "grad_norm": 1.9602468013763428, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7025474905967712, + "num_tokens": 362554532.0, + "step": 14327 + }, + { + "epoch": 1.5734680430485395, + "grad_norm": 2.4016218185424805, + "learning_rate": 1e-06, + "loss": 0.8326, + "mean_token_accuracy": 0.7393984198570251, + "num_tokens": 362576807.0, + "step": 14328 + }, + { + "epoch": 1.5735778607511532, + "grad_norm": 2.31725811958313, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.6974483728408813, + "num_tokens": 362601215.0, + "step": 14329 + }, + { + "epoch": 1.5736876784537668, + "grad_norm": 2.2004928588867188, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.7035172581672668, + "num_tokens": 362627163.0, + "step": 14330 + }, + { + "epoch": 1.5737974961563803, + "grad_norm": 2.6389338970184326, + "learning_rate": 1e-06, + "loss": 0.8084, + "mean_token_accuracy": 0.7423003911972046, + "num_tokens": 362645883.0, + 
"step": 14331 + }, + { + "epoch": 1.573907313858994, + "grad_norm": 2.1782615184783936, + "learning_rate": 1e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.719618558883667, + "num_tokens": 362670911.0, + "step": 14332 + }, + { + "epoch": 1.5740171315616078, + "grad_norm": 2.282650947570801, + "learning_rate": 1e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.7085222601890564, + "num_tokens": 362696242.0, + "step": 14333 + }, + { + "epoch": 1.5741269492642214, + "grad_norm": 2.216963768005371, + "learning_rate": 1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.686388373374939, + "num_tokens": 362723674.0, + "step": 14334 + }, + { + "epoch": 1.574236766966835, + "grad_norm": 2.1429619789123535, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.7034434080123901, + "num_tokens": 362750146.0, + "step": 14335 + }, + { + "epoch": 1.5743465846694487, + "grad_norm": 2.1774535179138184, + "learning_rate": 1e-06, + "loss": 0.908, + "mean_token_accuracy": 0.723635196685791, + "num_tokens": 362776319.0, + "step": 14336 + }, + { + "epoch": 1.5744564023720624, + "grad_norm": 2.3207550048828125, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7041795253753662, + "num_tokens": 362800733.0, + "step": 14337 + }, + { + "epoch": 1.5745662200746762, + "grad_norm": 2.63035249710083, + "learning_rate": 1e-06, + "loss": 0.8576, + "mean_token_accuracy": 0.7269167900085449, + "num_tokens": 362820394.0, + "step": 14338 + }, + { + "epoch": 1.5746760377772897, + "grad_norm": 2.2235615253448486, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7234305143356323, + "num_tokens": 362844978.0, + "step": 14339 + }, + { + "epoch": 1.5747858554799032, + "grad_norm": 2.121042013168335, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7071261405944824, + "num_tokens": 362872271.0, + "step": 14340 + }, + { + "epoch": 1.574895673182517, + "grad_norm": 2.1303701400756836, + "learning_rate": 1e-06, + "loss": 
0.8372, + "mean_token_accuracy": 0.7305891513824463, + "num_tokens": 362899618.0, + "step": 14341 + }, + { + "epoch": 1.5750054908851308, + "grad_norm": 2.4807488918304443, + "learning_rate": 1e-06, + "loss": 0.8585, + "mean_token_accuracy": 0.737709105014801, + "num_tokens": 362922427.0, + "step": 14342 + }, + { + "epoch": 1.5751153085877443, + "grad_norm": 2.5165493488311768, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7190883159637451, + "num_tokens": 362943285.0, + "step": 14343 + }, + { + "epoch": 1.575225126290358, + "grad_norm": 2.0860228538513184, + "learning_rate": 1e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.6898173093795776, + "num_tokens": 362974488.0, + "step": 14344 + }, + { + "epoch": 1.5753349439929716, + "grad_norm": 2.142204523086548, + "learning_rate": 1e-06, + "loss": 0.8658, + "mean_token_accuracy": 0.7319076657295227, + "num_tokens": 363001510.0, + "step": 14345 + }, + { + "epoch": 1.5754447616955853, + "grad_norm": 2.5484848022460938, + "learning_rate": 1e-06, + "loss": 0.8362, + "mean_token_accuracy": 0.7310001254081726, + "num_tokens": 363021076.0, + "step": 14346 + }, + { + "epoch": 1.575554579398199, + "grad_norm": 2.2145535945892334, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.7018102407455444, + "num_tokens": 363048145.0, + "step": 14347 + }, + { + "epoch": 1.5756643971008126, + "grad_norm": 2.4070382118225098, + "learning_rate": 1e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.7316466569900513, + "num_tokens": 363070696.0, + "step": 14348 + }, + { + "epoch": 1.5757742148034262, + "grad_norm": 2.12587308883667, + "learning_rate": 1e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.700137197971344, + "num_tokens": 363105207.0, + "step": 14349 + }, + { + "epoch": 1.57588403250604, + "grad_norm": 2.431398868560791, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7267098426818848, + "num_tokens": 363127270.0, + "step": 14350 + }, + { + "epoch": 
1.5759938502086537, + "grad_norm": 2.25003719329834, + "learning_rate": 1e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7377593517303467, + "num_tokens": 363151466.0, + "step": 14351 + }, + { + "epoch": 1.5761036679112674, + "grad_norm": 2.2142298221588135, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7038083672523499, + "num_tokens": 363179096.0, + "step": 14352 + }, + { + "epoch": 1.576213485613881, + "grad_norm": 2.558565616607666, + "learning_rate": 1e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.7258760929107666, + "num_tokens": 363199626.0, + "step": 14353 + }, + { + "epoch": 1.5763233033164945, + "grad_norm": 2.2733099460601807, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7167588472366333, + "num_tokens": 363224091.0, + "step": 14354 + }, + { + "epoch": 1.5764331210191083, + "grad_norm": 2.4159486293792725, + "learning_rate": 1e-06, + "loss": 0.8171, + "mean_token_accuracy": 0.743340015411377, + "num_tokens": 363246315.0, + "step": 14355 + }, + { + "epoch": 1.576542938721722, + "grad_norm": 2.1979012489318848, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.691474437713623, + "num_tokens": 363275961.0, + "step": 14356 + }, + { + "epoch": 1.5766527564243356, + "grad_norm": 2.2951712608337402, + "learning_rate": 1e-06, + "loss": 0.8288, + "mean_token_accuracy": 0.743678092956543, + "num_tokens": 363298847.0, + "step": 14357 + }, + { + "epoch": 1.5767625741269493, + "grad_norm": 2.2182624340057373, + "learning_rate": 1e-06, + "loss": 0.83, + "mean_token_accuracy": 0.7419766783714294, + "num_tokens": 363325432.0, + "step": 14358 + }, + { + "epoch": 1.5768723918295628, + "grad_norm": 1.96523118019104, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7250031232833862, + "num_tokens": 363356524.0, + "step": 14359 + }, + { + "epoch": 1.5769822095321766, + "grad_norm": 2.2664873600006104, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 
0.7020084857940674, + "num_tokens": 363382040.0, + "step": 14360 + }, + { + "epoch": 1.5770920272347904, + "grad_norm": 2.2026665210723877, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7116734385490417, + "num_tokens": 363408927.0, + "step": 14361 + }, + { + "epoch": 1.577201844937404, + "grad_norm": 2.304635524749756, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7208878993988037, + "num_tokens": 363434265.0, + "step": 14362 + }, + { + "epoch": 1.5773116626400174, + "grad_norm": 1.9985334873199463, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7020472884178162, + "num_tokens": 363466167.0, + "step": 14363 + }, + { + "epoch": 1.5774214803426312, + "grad_norm": 2.3168532848358154, + "learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.7385552525520325, + "num_tokens": 363491714.0, + "step": 14364 + }, + { + "epoch": 1.577531298045245, + "grad_norm": 2.0902199745178223, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.6980636119842529, + "num_tokens": 363520352.0, + "step": 14365 + }, + { + "epoch": 1.5776411157478587, + "grad_norm": 1.9404325485229492, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7008694410324097, + "num_tokens": 363553351.0, + "step": 14366 + }, + { + "epoch": 1.5777509334504722, + "grad_norm": 2.2024831771850586, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7126762866973877, + "num_tokens": 363580380.0, + "step": 14367 + }, + { + "epoch": 1.5778607511530858, + "grad_norm": 2.4169411659240723, + "learning_rate": 1e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7311467528343201, + "num_tokens": 363601541.0, + "step": 14368 + }, + { + "epoch": 1.5779705688556995, + "grad_norm": 1.9929537773132324, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7161922454833984, + "num_tokens": 363632637.0, + "step": 14369 + }, + { + "epoch": 1.5780803865583133, + "grad_norm": 
2.322195291519165, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7082479596138, + "num_tokens": 363657181.0, + "step": 14370 + }, + { + "epoch": 1.5781902042609268, + "grad_norm": 2.2005677223205566, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7100098133087158, + "num_tokens": 363684390.0, + "step": 14371 + }, + { + "epoch": 1.5783000219635406, + "grad_norm": 2.2582240104675293, + "learning_rate": 1e-06, + "loss": 0.8558, + "mean_token_accuracy": 0.735552966594696, + "num_tokens": 363708628.0, + "step": 14372 + }, + { + "epoch": 1.5784098396661541, + "grad_norm": 2.331125259399414, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7188506126403809, + "num_tokens": 363731500.0, + "step": 14373 + }, + { + "epoch": 1.5785196573687679, + "grad_norm": 2.203117847442627, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.708584189414978, + "num_tokens": 363757768.0, + "step": 14374 + }, + { + "epoch": 1.5786294750713816, + "grad_norm": 2.565873861312866, + "learning_rate": 1e-06, + "loss": 0.8051, + "mean_token_accuracy": 0.7462725639343262, + "num_tokens": 363777305.0, + "step": 14375 + }, + { + "epoch": 1.5787392927739952, + "grad_norm": 2.1824324131011963, + "learning_rate": 1e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.7455494999885559, + "num_tokens": 363803635.0, + "step": 14376 + }, + { + "epoch": 1.5788491104766087, + "grad_norm": 2.328242063522339, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7109807729721069, + "num_tokens": 363826422.0, + "step": 14377 + }, + { + "epoch": 1.5789589281792225, + "grad_norm": 2.175175428390503, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.702928900718689, + "num_tokens": 363852671.0, + "step": 14378 + }, + { + "epoch": 1.5790687458818362, + "grad_norm": 2.1433000564575195, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.711764931678772, + "num_tokens": 
363880821.0, + "step": 14379 + }, + { + "epoch": 1.57917856358445, + "grad_norm": 2.2631731033325195, + "learning_rate": 1e-06, + "loss": 0.7755, + "mean_token_accuracy": 0.7508878707885742, + "num_tokens": 363903152.0, + "step": 14380 + }, + { + "epoch": 1.5792883812870635, + "grad_norm": 2.119197130203247, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7144824266433716, + "num_tokens": 363930340.0, + "step": 14381 + }, + { + "epoch": 1.579398198989677, + "grad_norm": 2.356097459793091, + "learning_rate": 1e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.7261370420455933, + "num_tokens": 363952008.0, + "step": 14382 + }, + { + "epoch": 1.5795080166922908, + "grad_norm": 2.263338327407837, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7124593257904053, + "num_tokens": 363977067.0, + "step": 14383 + }, + { + "epoch": 1.5796178343949046, + "grad_norm": 2.069182872772217, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7235162258148193, + "num_tokens": 364004190.0, + "step": 14384 + }, + { + "epoch": 1.579727652097518, + "grad_norm": 2.206834554672241, + "learning_rate": 1e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.720249354839325, + "num_tokens": 364029869.0, + "step": 14385 + }, + { + "epoch": 1.5798374698001316, + "grad_norm": 2.4185822010040283, + "learning_rate": 1e-06, + "loss": 1.0407, + "mean_token_accuracy": 0.679385781288147, + "num_tokens": 364055244.0, + "step": 14386 + }, + { + "epoch": 1.5799472875027454, + "grad_norm": 2.241262197494507, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7138206958770752, + "num_tokens": 364080344.0, + "step": 14387 + }, + { + "epoch": 1.5800571052053591, + "grad_norm": 2.851198673248291, + "learning_rate": 1e-06, + "loss": 0.7962, + "mean_token_accuracy": 0.7490670680999756, + "num_tokens": 364096654.0, + "step": 14388 + }, + { + "epoch": 1.580166922907973, + "grad_norm": 2.14481782913208, + "learning_rate": 1e-06, + 
"loss": 0.849, + "mean_token_accuracy": 0.7334716320037842, + "num_tokens": 364121932.0, + "step": 14389 + }, + { + "epoch": 1.5802767406105864, + "grad_norm": 2.155625820159912, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7020637392997742, + "num_tokens": 364149860.0, + "step": 14390 + }, + { + "epoch": 1.5803865583132, + "grad_norm": 2.1064112186431885, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7072125673294067, + "num_tokens": 364179429.0, + "step": 14391 + }, + { + "epoch": 1.5804963760158137, + "grad_norm": 2.1748435497283936, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.6846020221710205, + "num_tokens": 364209137.0, + "step": 14392 + }, + { + "epoch": 1.5806061937184275, + "grad_norm": 2.204616069793701, + "learning_rate": 1e-06, + "loss": 0.8545, + "mean_token_accuracy": 0.7384659051895142, + "num_tokens": 364234815.0, + "step": 14393 + }, + { + "epoch": 1.5807160114210412, + "grad_norm": 2.688828706741333, + "learning_rate": 1e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7197366952896118, + "num_tokens": 364255260.0, + "step": 14394 + }, + { + "epoch": 1.5808258291236548, + "grad_norm": 2.7980947494506836, + "learning_rate": 1e-06, + "loss": 0.8121, + "mean_token_accuracy": 0.7383742332458496, + "num_tokens": 364273917.0, + "step": 14395 + }, + { + "epoch": 1.5809356468262683, + "grad_norm": 2.3398776054382324, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7119745016098022, + "num_tokens": 364297935.0, + "step": 14396 + }, + { + "epoch": 1.581045464528882, + "grad_norm": 2.398897647857666, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.717300534248352, + "num_tokens": 364322127.0, + "step": 14397 + }, + { + "epoch": 1.5811552822314958, + "grad_norm": 2.3235201835632324, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.721874475479126, + "num_tokens": 364347608.0, + "step": 14398 + }, + { + "epoch": 
1.5812650999341094, + "grad_norm": 2.156000852584839, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7111611366271973, + "num_tokens": 364372723.0, + "step": 14399 + }, + { + "epoch": 1.5813749176367229, + "grad_norm": 2.569871664047241, + "learning_rate": 1e-06, + "loss": 0.8485, + "mean_token_accuracy": 0.7305200099945068, + "num_tokens": 364391919.0, + "step": 14400 + }, + { + "epoch": 1.5814847353393366, + "grad_norm": 2.1941370964050293, + "learning_rate": 1e-06, + "loss": 0.838, + "mean_token_accuracy": 0.7327927947044373, + "num_tokens": 364416536.0, + "step": 14401 + }, + { + "epoch": 1.5815945530419504, + "grad_norm": 2.3346338272094727, + "learning_rate": 1e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7238261699676514, + "num_tokens": 364441930.0, + "step": 14402 + }, + { + "epoch": 1.5817043707445642, + "grad_norm": 2.6295690536499023, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7254924774169922, + "num_tokens": 364463063.0, + "step": 14403 + }, + { + "epoch": 1.5818141884471777, + "grad_norm": 2.045358896255493, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7221760749816895, + "num_tokens": 364491985.0, + "step": 14404 + }, + { + "epoch": 1.5819240061497912, + "grad_norm": 2.1127774715423584, + "learning_rate": 1e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.6981843709945679, + "num_tokens": 364520734.0, + "step": 14405 + }, + { + "epoch": 1.582033823852405, + "grad_norm": 2.182663679122925, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7261061668395996, + "num_tokens": 364545720.0, + "step": 14406 + }, + { + "epoch": 1.5821436415550187, + "grad_norm": 2.0830702781677246, + "learning_rate": 1e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.7286909818649292, + "num_tokens": 364572700.0, + "step": 14407 + }, + { + "epoch": 1.5822534592576323, + "grad_norm": 2.0605051517486572, + "learning_rate": 1e-06, + "loss": 0.7946, + "mean_token_accuracy": 
0.7482630610466003, + "num_tokens": 364600415.0, + "step": 14408 + }, + { + "epoch": 1.582363276960246, + "grad_norm": 2.089982509613037, + "learning_rate": 1e-06, + "loss": 0.8191, + "mean_token_accuracy": 0.7393254041671753, + "num_tokens": 364629547.0, + "step": 14409 + }, + { + "epoch": 1.5824730946628596, + "grad_norm": 2.092611789703369, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7062872648239136, + "num_tokens": 364659636.0, + "step": 14410 + }, + { + "epoch": 1.5825829123654733, + "grad_norm": 2.2858779430389404, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7199114561080933, + "num_tokens": 364684177.0, + "step": 14411 + }, + { + "epoch": 1.582692730068087, + "grad_norm": 2.1729304790496826, + "learning_rate": 1e-06, + "loss": 0.7983, + "mean_token_accuracy": 0.7512075901031494, + "num_tokens": 364710269.0, + "step": 14412 + }, + { + "epoch": 1.5828025477707006, + "grad_norm": 2.3340609073638916, + "learning_rate": 1e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.7214241623878479, + "num_tokens": 364733431.0, + "step": 14413 + }, + { + "epoch": 1.5829123654733142, + "grad_norm": 2.0397791862487793, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7195186018943787, + "num_tokens": 364762010.0, + "step": 14414 + }, + { + "epoch": 1.583022183175928, + "grad_norm": 2.180602550506592, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7125675678253174, + "num_tokens": 364789954.0, + "step": 14415 + }, + { + "epoch": 1.5831320008785417, + "grad_norm": 2.5201728343963623, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.721081018447876, + "num_tokens": 364811639.0, + "step": 14416 + }, + { + "epoch": 1.5832418185811554, + "grad_norm": 2.3192198276519775, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7178866863250732, + "num_tokens": 364836399.0, + "step": 14417 + }, + { + "epoch": 1.583351636283769, + "grad_norm": 
2.178157329559326, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7178427577018738, + "num_tokens": 364863070.0, + "step": 14418 + }, + { + "epoch": 1.5834614539863825, + "grad_norm": 2.6266815662384033, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7157269716262817, + "num_tokens": 364883619.0, + "step": 14419 + }, + { + "epoch": 1.5835712716889963, + "grad_norm": 2.334711790084839, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7085996866226196, + "num_tokens": 364909822.0, + "step": 14420 + }, + { + "epoch": 1.58368108939161, + "grad_norm": 2.6267244815826416, + "learning_rate": 1e-06, + "loss": 0.7839, + "mean_token_accuracy": 0.7489454746246338, + "num_tokens": 364929136.0, + "step": 14421 + }, + { + "epoch": 1.5837909070942235, + "grad_norm": 2.3567240238189697, + "learning_rate": 1e-06, + "loss": 0.8147, + "mean_token_accuracy": 0.7393417358398438, + "num_tokens": 364951573.0, + "step": 14422 + }, + { + "epoch": 1.5839007247968373, + "grad_norm": 2.5957729816436768, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7213374972343445, + "num_tokens": 364972910.0, + "step": 14423 + }, + { + "epoch": 1.5840105424994508, + "grad_norm": 2.2690694332122803, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7314648628234863, + "num_tokens": 364996246.0, + "step": 14424 + }, + { + "epoch": 1.5841203602020646, + "grad_norm": 2.3377768993377686, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7211627960205078, + "num_tokens": 365018406.0, + "step": 14425 + }, + { + "epoch": 1.5842301779046783, + "grad_norm": 2.2193281650543213, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7203425168991089, + "num_tokens": 365043889.0, + "step": 14426 + }, + { + "epoch": 1.5843399956072919, + "grad_norm": 2.010115623474121, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.71817547082901, + "num_tokens": 
365075266.0, + "step": 14427 + }, + { + "epoch": 1.5844498133099054, + "grad_norm": 2.634423017501831, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7265799641609192, + "num_tokens": 365094934.0, + "step": 14428 + }, + { + "epoch": 1.5845596310125192, + "grad_norm": 1.8674474954605103, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.7041105628013611, + "num_tokens": 365128689.0, + "step": 14429 + }, + { + "epoch": 1.584669448715133, + "grad_norm": 2.3183600902557373, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7239779233932495, + "num_tokens": 365152238.0, + "step": 14430 + }, + { + "epoch": 1.5847792664177467, + "grad_norm": 2.5350375175476074, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.6916845440864563, + "num_tokens": 365174050.0, + "step": 14431 + }, + { + "epoch": 1.5848890841203602, + "grad_norm": 2.2125558853149414, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.6958427429199219, + "num_tokens": 365203436.0, + "step": 14432 + }, + { + "epoch": 1.5849989018229738, + "grad_norm": 2.30562686920166, + "learning_rate": 1e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.7175877094268799, + "num_tokens": 365228183.0, + "step": 14433 + }, + { + "epoch": 1.5851087195255875, + "grad_norm": 2.7346019744873047, + "learning_rate": 1e-06, + "loss": 0.7566, + "mean_token_accuracy": 0.7536331415176392, + "num_tokens": 365246457.0, + "step": 14434 + }, + { + "epoch": 1.5852185372282013, + "grad_norm": 2.1441619396209717, + "learning_rate": 1e-06, + "loss": 0.8451, + "mean_token_accuracy": 0.7371268272399902, + "num_tokens": 365271890.0, + "step": 14435 + }, + { + "epoch": 1.5853283549308148, + "grad_norm": 2.4069390296936035, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7240042090415955, + "num_tokens": 365294606.0, + "step": 14436 + }, + { + "epoch": 1.5854381726334283, + "grad_norm": 2.7718958854675293, + "learning_rate": 
1e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.7384543418884277, + "num_tokens": 365314242.0, + "step": 14437 + }, + { + "epoch": 1.585547990336042, + "grad_norm": 2.2475545406341553, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.726418137550354, + "num_tokens": 365342452.0, + "step": 14438 + }, + { + "epoch": 1.5856578080386559, + "grad_norm": 2.118316650390625, + "learning_rate": 1e-06, + "loss": 0.8148, + "mean_token_accuracy": 0.7371489405632019, + "num_tokens": 365369282.0, + "step": 14439 + }, + { + "epoch": 1.5857676257412696, + "grad_norm": 2.362445116043091, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.712110161781311, + "num_tokens": 365395344.0, + "step": 14440 + }, + { + "epoch": 1.5858774434438832, + "grad_norm": 2.3848330974578857, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7201865911483765, + "num_tokens": 365419589.0, + "step": 14441 + }, + { + "epoch": 1.5859872611464967, + "grad_norm": 2.173800468444824, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7153961658477783, + "num_tokens": 365447087.0, + "step": 14442 + }, + { + "epoch": 1.5860970788491104, + "grad_norm": 2.2431435585021973, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.719788670539856, + "num_tokens": 365472003.0, + "step": 14443 + }, + { + "epoch": 1.5862068965517242, + "grad_norm": 2.368333339691162, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7084822654724121, + "num_tokens": 365494626.0, + "step": 14444 + }, + { + "epoch": 1.586316714254338, + "grad_norm": 2.2921712398529053, + "learning_rate": 1e-06, + "loss": 0.8568, + "mean_token_accuracy": 0.7284618020057678, + "num_tokens": 365518476.0, + "step": 14445 + }, + { + "epoch": 1.5864265319569515, + "grad_norm": 2.0591864585876465, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7233999371528625, + "num_tokens": 365548389.0, + "step": 14446 + }, + { + 
"epoch": 1.586536349659565, + "grad_norm": 2.3718507289886475, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7086794376373291, + "num_tokens": 365572481.0, + "step": 14447 + }, + { + "epoch": 1.5866461673621788, + "grad_norm": 2.533752918243408, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.721879780292511, + "num_tokens": 365594078.0, + "step": 14448 + }, + { + "epoch": 1.5867559850647925, + "grad_norm": 2.27820086479187, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.724329948425293, + "num_tokens": 365618128.0, + "step": 14449 + }, + { + "epoch": 1.586865802767406, + "grad_norm": 2.3773999214172363, + "learning_rate": 1e-06, + "loss": 0.8393, + "mean_token_accuracy": 0.746900200843811, + "num_tokens": 365640012.0, + "step": 14450 + }, + { + "epoch": 1.5869756204700196, + "grad_norm": 2.3712949752807617, + "learning_rate": 1e-06, + "loss": 0.8395, + "mean_token_accuracy": 0.7423639893531799, + "num_tokens": 365662847.0, + "step": 14451 + }, + { + "epoch": 1.5870854381726334, + "grad_norm": 2.530686855316162, + "learning_rate": 1e-06, + "loss": 0.8655, + "mean_token_accuracy": 0.7255704998970032, + "num_tokens": 365682668.0, + "step": 14452 + }, + { + "epoch": 1.5871952558752471, + "grad_norm": 2.3951416015625, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7164078950881958, + "num_tokens": 365706457.0, + "step": 14453 + }, + { + "epoch": 1.5873050735778609, + "grad_norm": 2.4094908237457275, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.717599093914032, + "num_tokens": 365731558.0, + "step": 14454 + }, + { + "epoch": 1.5874148912804744, + "grad_norm": 2.2828049659729004, + "learning_rate": 1e-06, + "loss": 0.8135, + "mean_token_accuracy": 0.7350450754165649, + "num_tokens": 365754925.0, + "step": 14455 + }, + { + "epoch": 1.587524708983088, + "grad_norm": 2.1835896968841553, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 
0.7086678743362427, + "num_tokens": 365781461.0, + "step": 14456 + }, + { + "epoch": 1.5876345266857017, + "grad_norm": 2.129866123199463, + "learning_rate": 1e-06, + "loss": 0.8563, + "mean_token_accuracy": 0.7353543639183044, + "num_tokens": 365808756.0, + "step": 14457 + }, + { + "epoch": 1.5877443443883155, + "grad_norm": 2.2390615940093994, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7033720016479492, + "num_tokens": 365834887.0, + "step": 14458 + }, + { + "epoch": 1.5878541620909292, + "grad_norm": 2.141144037246704, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.6999841928482056, + "num_tokens": 365863787.0, + "step": 14459 + }, + { + "epoch": 1.5879639797935428, + "grad_norm": 2.276566982269287, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7191067934036255, + "num_tokens": 365889838.0, + "step": 14460 + }, + { + "epoch": 1.5880737974961563, + "grad_norm": 2.1707868576049805, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7099337577819824, + "num_tokens": 365915844.0, + "step": 14461 + }, + { + "epoch": 1.58818361519877, + "grad_norm": 2.2301650047302246, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.6950023174285889, + "num_tokens": 365942599.0, + "step": 14462 + }, + { + "epoch": 1.5882934329013838, + "grad_norm": 2.174940824508667, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7206610441207886, + "num_tokens": 365969581.0, + "step": 14463 + }, + { + "epoch": 1.5884032506039973, + "grad_norm": 2.5417301654815674, + "learning_rate": 1e-06, + "loss": 0.8313, + "mean_token_accuracy": 0.735466480255127, + "num_tokens": 365989461.0, + "step": 14464 + }, + { + "epoch": 1.5885130683066109, + "grad_norm": 2.2407608032226562, + "learning_rate": 1e-06, + "loss": 0.7967, + "mean_token_accuracy": 0.7504367828369141, + "num_tokens": 366014249.0, + "step": 14465 + }, + { + "epoch": 1.5886228860092246, + "grad_norm": 
2.2564034461975098, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7107820510864258, + "num_tokens": 366040469.0, + "step": 14466 + }, + { + "epoch": 1.5887327037118384, + "grad_norm": 2.110018253326416, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7128949761390686, + "num_tokens": 366068228.0, + "step": 14467 + }, + { + "epoch": 1.5888425214144521, + "grad_norm": 2.298832654953003, + "learning_rate": 1e-06, + "loss": 0.8286, + "mean_token_accuracy": 0.7449268102645874, + "num_tokens": 366091507.0, + "step": 14468 + }, + { + "epoch": 1.5889523391170657, + "grad_norm": 2.284208297729492, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7216882705688477, + "num_tokens": 366114041.0, + "step": 14469 + }, + { + "epoch": 1.5890621568196792, + "grad_norm": 2.1558406352996826, + "learning_rate": 1e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7243427634239197, + "num_tokens": 366141209.0, + "step": 14470 + }, + { + "epoch": 1.589171974522293, + "grad_norm": 2.0185370445251465, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7183468341827393, + "num_tokens": 366173521.0, + "step": 14471 + }, + { + "epoch": 1.5892817922249067, + "grad_norm": 2.326976776123047, + "learning_rate": 1e-06, + "loss": 0.8075, + "mean_token_accuracy": 0.7358206510543823, + "num_tokens": 366196768.0, + "step": 14472 + }, + { + "epoch": 1.5893916099275203, + "grad_norm": 2.557509660720825, + "learning_rate": 1e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7297669053077698, + "num_tokens": 366215602.0, + "step": 14473 + }, + { + "epoch": 1.589501427630134, + "grad_norm": 1.9909801483154297, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7163776159286499, + "num_tokens": 366245450.0, + "step": 14474 + }, + { + "epoch": 1.5896112453327476, + "grad_norm": 2.213996648788452, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7250576019287109, + "num_tokens": 
366269839.0, + "step": 14475 + }, + { + "epoch": 1.5897210630353613, + "grad_norm": 2.207615375518799, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7118743658065796, + "num_tokens": 366297652.0, + "step": 14476 + }, + { + "epoch": 1.589830880737975, + "grad_norm": 1.970215916633606, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7260304689407349, + "num_tokens": 366331792.0, + "step": 14477 + }, + { + "epoch": 1.5899406984405886, + "grad_norm": 2.0856313705444336, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7204333543777466, + "num_tokens": 366360626.0, + "step": 14478 + }, + { + "epoch": 1.5900505161432021, + "grad_norm": 2.0551328659057617, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7240384817123413, + "num_tokens": 366388176.0, + "step": 14479 + }, + { + "epoch": 1.590160333845816, + "grad_norm": 2.3632545471191406, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7068182229995728, + "num_tokens": 366413668.0, + "step": 14480 + }, + { + "epoch": 1.5902701515484297, + "grad_norm": 2.330042839050293, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7229039669036865, + "num_tokens": 366437633.0, + "step": 14481 + }, + { + "epoch": 1.5903799692510434, + "grad_norm": 2.369607448577881, + "learning_rate": 1e-06, + "loss": 0.7875, + "mean_token_accuracy": 0.754579484462738, + "num_tokens": 366459373.0, + "step": 14482 + }, + { + "epoch": 1.590489786953657, + "grad_norm": 2.3255958557128906, + "learning_rate": 1e-06, + "loss": 0.8108, + "mean_token_accuracy": 0.7438479065895081, + "num_tokens": 366481249.0, + "step": 14483 + }, + { + "epoch": 1.5905996046562705, + "grad_norm": 2.059696912765503, + "learning_rate": 1e-06, + "loss": 0.8489, + "mean_token_accuracy": 0.7378829717636108, + "num_tokens": 366510123.0, + "step": 14484 + }, + { + "epoch": 1.5907094223588842, + "grad_norm": 2.340162515640259, + "learning_rate": 1e-06, 
+ "loss": 0.9844, + "mean_token_accuracy": 0.7001912593841553, + "num_tokens": 366535958.0, + "step": 14485 + }, + { + "epoch": 1.590819240061498, + "grad_norm": 2.0976295471191406, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7204636931419373, + "num_tokens": 366564157.0, + "step": 14486 + }, + { + "epoch": 1.5909290577641115, + "grad_norm": 2.3963661193847656, + "learning_rate": 1e-06, + "loss": 0.844, + "mean_token_accuracy": 0.733600378036499, + "num_tokens": 366588129.0, + "step": 14487 + }, + { + "epoch": 1.5910388754667253, + "grad_norm": 2.210822820663452, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7205793857574463, + "num_tokens": 366613726.0, + "step": 14488 + }, + { + "epoch": 1.5911486931693388, + "grad_norm": 2.5976691246032715, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7141633629798889, + "num_tokens": 366635138.0, + "step": 14489 + }, + { + "epoch": 1.5912585108719526, + "grad_norm": 2.0329532623291016, + "learning_rate": 1e-06, + "loss": 0.8442, + "mean_token_accuracy": 0.7317051887512207, + "num_tokens": 366663483.0, + "step": 14490 + }, + { + "epoch": 1.5913683285745663, + "grad_norm": 1.9485210180282593, + "learning_rate": 1e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.6860314607620239, + "num_tokens": 366697781.0, + "step": 14491 + }, + { + "epoch": 1.5914781462771799, + "grad_norm": 1.959438681602478, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7001076936721802, + "num_tokens": 366731000.0, + "step": 14492 + }, + { + "epoch": 1.5915879639797934, + "grad_norm": 2.0279617309570312, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7023254036903381, + "num_tokens": 366762812.0, + "step": 14493 + }, + { + "epoch": 1.5916977816824072, + "grad_norm": 2.5258383750915527, + "learning_rate": 1e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.7257768511772156, + "num_tokens": 366783776.0, + "step": 14494 + }, + { + 
"epoch": 1.591807599385021, + "grad_norm": 2.40936279296875, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7345188856124878, + "num_tokens": 366806480.0, + "step": 14495 + }, + { + "epoch": 1.5919174170876347, + "grad_norm": 2.2284066677093506, + "learning_rate": 1e-06, + "loss": 0.7611, + "mean_token_accuracy": 0.7644726037979126, + "num_tokens": 366831242.0, + "step": 14496 + }, + { + "epoch": 1.5920272347902482, + "grad_norm": 1.837626338005066, + "learning_rate": 1e-06, + "loss": 0.8149, + "mean_token_accuracy": 0.7400708794593811, + "num_tokens": 366863727.0, + "step": 14497 + }, + { + "epoch": 1.5921370524928617, + "grad_norm": 1.9397035837173462, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.6999233365058899, + "num_tokens": 366897727.0, + "step": 14498 + }, + { + "epoch": 1.5922468701954755, + "grad_norm": 2.062938928604126, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7039910554885864, + "num_tokens": 366928891.0, + "step": 14499 + }, + { + "epoch": 1.5923566878980893, + "grad_norm": 2.429534912109375, + "learning_rate": 1e-06, + "loss": 0.823, + "mean_token_accuracy": 0.7402830123901367, + "num_tokens": 366950732.0, + "step": 14500 + }, + { + "epoch": 1.5924665056007028, + "grad_norm": 2.5660653114318848, + "learning_rate": 1e-06, + "loss": 0.7038, + "mean_token_accuracy": 0.767798125743866, + "num_tokens": 366969806.0, + "step": 14501 + }, + { + "epoch": 1.5925763233033163, + "grad_norm": 2.0709636211395264, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.723864734172821, + "num_tokens": 367000178.0, + "step": 14502 + }, + { + "epoch": 1.59268614100593, + "grad_norm": 2.3876142501831055, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7087666392326355, + "num_tokens": 367027092.0, + "step": 14503 + }, + { + "epoch": 1.5927959587085438, + "grad_norm": 2.420037269592285, + "learning_rate": 1e-06, + "loss": 1.0086, + 
"mean_token_accuracy": 0.6946307420730591, + "num_tokens": 367051000.0, + "step": 14504 + }, + { + "epoch": 1.5929057764111576, + "grad_norm": 2.5399081707000732, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7182080745697021, + "num_tokens": 367070881.0, + "step": 14505 + }, + { + "epoch": 1.5930155941137711, + "grad_norm": 2.383843183517456, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7232096195220947, + "num_tokens": 367094684.0, + "step": 14506 + }, + { + "epoch": 1.5931254118163847, + "grad_norm": 2.119311571121216, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7272084951400757, + "num_tokens": 367121759.0, + "step": 14507 + }, + { + "epoch": 1.5932352295189984, + "grad_norm": 2.027942180633545, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7136427760124207, + "num_tokens": 367151086.0, + "step": 14508 + }, + { + "epoch": 1.5933450472216122, + "grad_norm": 2.45774507522583, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7219985723495483, + "num_tokens": 367172012.0, + "step": 14509 + }, + { + "epoch": 1.593454864924226, + "grad_norm": 2.292973518371582, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.71617591381073, + "num_tokens": 367196650.0, + "step": 14510 + }, + { + "epoch": 1.5935646826268395, + "grad_norm": 2.2415525913238525, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7158052325248718, + "num_tokens": 367222607.0, + "step": 14511 + }, + { + "epoch": 1.593674500329453, + "grad_norm": 2.219773292541504, + "learning_rate": 1e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7260105013847351, + "num_tokens": 367248651.0, + "step": 14512 + }, + { + "epoch": 1.5937843180320668, + "grad_norm": 2.4170708656311035, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7106384038925171, + "num_tokens": 367271065.0, + "step": 14513 + }, + { + "epoch": 1.5938941357346805, + 
"grad_norm": 2.344494342803955, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7063945531845093, + "num_tokens": 367296251.0, + "step": 14514 + }, + { + "epoch": 1.594003953437294, + "grad_norm": 2.2752797603607178, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7140158414840698, + "num_tokens": 367321083.0, + "step": 14515 + }, + { + "epoch": 1.5941137711399076, + "grad_norm": 2.6548104286193848, + "learning_rate": 1e-06, + "loss": 0.8455, + "mean_token_accuracy": 0.7319042086601257, + "num_tokens": 367340101.0, + "step": 14516 + }, + { + "epoch": 1.5942235888425214, + "grad_norm": 2.348742723464966, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7100014686584473, + "num_tokens": 367364271.0, + "step": 14517 + }, + { + "epoch": 1.5943334065451351, + "grad_norm": 2.133378028869629, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.6961430311203003, + "num_tokens": 367395499.0, + "step": 14518 + }, + { + "epoch": 1.5944432242477489, + "grad_norm": 2.253845453262329, + "learning_rate": 1e-06, + "loss": 0.85, + "mean_token_accuracy": 0.7371587157249451, + "num_tokens": 367419885.0, + "step": 14519 + }, + { + "epoch": 1.5945530419503624, + "grad_norm": 2.4301629066467285, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.725679874420166, + "num_tokens": 367441234.0, + "step": 14520 + }, + { + "epoch": 1.594662859652976, + "grad_norm": 2.3035006523132324, + "learning_rate": 1e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7251405715942383, + "num_tokens": 367466869.0, + "step": 14521 + }, + { + "epoch": 1.5947726773555897, + "grad_norm": 2.3570971488952637, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7103741765022278, + "num_tokens": 367490528.0, + "step": 14522 + }, + { + "epoch": 1.5948824950582035, + "grad_norm": 2.195996046066284, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.6959556341171265, + 
"num_tokens": 367518361.0, + "step": 14523 + }, + { + "epoch": 1.5949923127608172, + "grad_norm": 2.1007583141326904, + "learning_rate": 1e-06, + "loss": 0.8504, + "mean_token_accuracy": 0.7380279302597046, + "num_tokens": 367546658.0, + "step": 14524 + }, + { + "epoch": 1.5951021304634307, + "grad_norm": 2.3253579139709473, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7119400501251221, + "num_tokens": 367570607.0, + "step": 14525 + }, + { + "epoch": 1.5952119481660443, + "grad_norm": 2.266885995864868, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7020711898803711, + "num_tokens": 367595628.0, + "step": 14526 + }, + { + "epoch": 1.595321765868658, + "grad_norm": 2.3930771350860596, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.716941237449646, + "num_tokens": 367618848.0, + "step": 14527 + }, + { + "epoch": 1.5954315835712718, + "grad_norm": 2.2293860912323, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7246312499046326, + "num_tokens": 367645729.0, + "step": 14528 + }, + { + "epoch": 1.5955414012738853, + "grad_norm": 2.643202066421509, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7213864922523499, + "num_tokens": 367667339.0, + "step": 14529 + }, + { + "epoch": 1.5956512189764989, + "grad_norm": 2.098804473876953, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7202569246292114, + "num_tokens": 367696015.0, + "step": 14530 + }, + { + "epoch": 1.5957610366791126, + "grad_norm": 2.451343059539795, + "learning_rate": 1e-06, + "loss": 0.8571, + "mean_token_accuracy": 0.7268457412719727, + "num_tokens": 367718969.0, + "step": 14531 + }, + { + "epoch": 1.5958708543817264, + "grad_norm": 2.5880463123321533, + "learning_rate": 1e-06, + "loss": 0.7953, + "mean_token_accuracy": 0.755354642868042, + "num_tokens": 367737606.0, + "step": 14532 + }, + { + "epoch": 1.5959806720843401, + "grad_norm": 2.2354607582092285, + 
"learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7229874134063721, + "num_tokens": 367763722.0, + "step": 14533 + }, + { + "epoch": 1.5960904897869537, + "grad_norm": 2.5195019245147705, + "learning_rate": 1e-06, + "loss": 0.8543, + "mean_token_accuracy": 0.7373077273368835, + "num_tokens": 367784386.0, + "step": 14534 + }, + { + "epoch": 1.5962003074895672, + "grad_norm": 2.422339916229248, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.7015281915664673, + "num_tokens": 367808492.0, + "step": 14535 + }, + { + "epoch": 1.596310125192181, + "grad_norm": 2.275674819946289, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7165026068687439, + "num_tokens": 367833201.0, + "step": 14536 + }, + { + "epoch": 1.5964199428947947, + "grad_norm": 2.315002202987671, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7000312805175781, + "num_tokens": 367859811.0, + "step": 14537 + }, + { + "epoch": 1.5965297605974083, + "grad_norm": 2.217689037322998, + "learning_rate": 1e-06, + "loss": 0.8407, + "mean_token_accuracy": 0.7368988394737244, + "num_tokens": 367885549.0, + "step": 14538 + }, + { + "epoch": 1.596639578300022, + "grad_norm": 2.165379524230957, + "learning_rate": 1e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.7241040468215942, + "num_tokens": 367912289.0, + "step": 14539 + }, + { + "epoch": 1.5967493960026355, + "grad_norm": 2.176694393157959, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7140819430351257, + "num_tokens": 367939760.0, + "step": 14540 + }, + { + "epoch": 1.5968592137052493, + "grad_norm": 2.1785242557525635, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.710169792175293, + "num_tokens": 367966806.0, + "step": 14541 + }, + { + "epoch": 1.596969031407863, + "grad_norm": 2.5320537090301514, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7233172059059143, + "num_tokens": 367987557.0, + "step": 
14542 + }, + { + "epoch": 1.5970788491104766, + "grad_norm": 2.3994946479797363, + "learning_rate": 1e-06, + "loss": 0.8543, + "mean_token_accuracy": 0.7344212532043457, + "num_tokens": 368008733.0, + "step": 14543 + }, + { + "epoch": 1.5971886668130901, + "grad_norm": 2.3479509353637695, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7255732417106628, + "num_tokens": 368030987.0, + "step": 14544 + }, + { + "epoch": 1.5972984845157039, + "grad_norm": 2.5094096660614014, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.723639965057373, + "num_tokens": 368051579.0, + "step": 14545 + }, + { + "epoch": 1.5974083022183176, + "grad_norm": 2.163411855697632, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.7032519578933716, + "num_tokens": 368079587.0, + "step": 14546 + }, + { + "epoch": 1.5975181199209314, + "grad_norm": 2.5209479331970215, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7257887125015259, + "num_tokens": 368101164.0, + "step": 14547 + }, + { + "epoch": 1.597627937623545, + "grad_norm": 2.1118507385253906, + "learning_rate": 1e-06, + "loss": 0.835, + "mean_token_accuracy": 0.7403655648231506, + "num_tokens": 368129642.0, + "step": 14548 + }, + { + "epoch": 1.5977377553261585, + "grad_norm": 2.2157301902770996, + "learning_rate": 1e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.7328548431396484, + "num_tokens": 368155534.0, + "step": 14549 + }, + { + "epoch": 1.5978475730287722, + "grad_norm": 2.262622594833374, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7131580114364624, + "num_tokens": 368181921.0, + "step": 14550 + }, + { + "epoch": 1.597957390731386, + "grad_norm": 2.3269846439361572, + "learning_rate": 1e-06, + "loss": 0.8028, + "mean_token_accuracy": 0.7402797937393188, + "num_tokens": 368204452.0, + "step": 14551 + }, + { + "epoch": 1.5980672084339995, + "grad_norm": 2.302426815032959, + "learning_rate": 1e-06, + "loss": 0.8955, 
+ "mean_token_accuracy": 0.7192611694335938, + "num_tokens": 368229396.0, + "step": 14552 + }, + { + "epoch": 1.5981770261366133, + "grad_norm": 2.277599811553955, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7186038494110107, + "num_tokens": 368254228.0, + "step": 14553 + }, + { + "epoch": 1.5982868438392268, + "grad_norm": 2.259051561355591, + "learning_rate": 1e-06, + "loss": 0.8133, + "mean_token_accuracy": 0.7411748170852661, + "num_tokens": 368278860.0, + "step": 14554 + }, + { + "epoch": 1.5983966615418406, + "grad_norm": 2.789517402648926, + "learning_rate": 1e-06, + "loss": 0.7976, + "mean_token_accuracy": 0.7463840246200562, + "num_tokens": 368297059.0, + "step": 14555 + }, + { + "epoch": 1.5985064792444543, + "grad_norm": 2.712221622467041, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7195388674736023, + "num_tokens": 368316758.0, + "step": 14556 + }, + { + "epoch": 1.5986162969470679, + "grad_norm": 2.1965718269348145, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7098680734634399, + "num_tokens": 368346398.0, + "step": 14557 + }, + { + "epoch": 1.5987261146496814, + "grad_norm": 2.72249436378479, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7211802005767822, + "num_tokens": 368363686.0, + "step": 14558 + }, + { + "epoch": 1.5988359323522952, + "grad_norm": 2.190384864807129, + "learning_rate": 1e-06, + "loss": 0.881, + "mean_token_accuracy": 0.7257691621780396, + "num_tokens": 368389975.0, + "step": 14559 + }, + { + "epoch": 1.598945750054909, + "grad_norm": 2.292308807373047, + "learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7245887517929077, + "num_tokens": 368414414.0, + "step": 14560 + }, + { + "epoch": 1.5990555677575227, + "grad_norm": 2.2517404556274414, + "learning_rate": 1e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7173587083816528, + "num_tokens": 368440784.0, + "step": 14561 + }, + { + "epoch": 
1.5991653854601362, + "grad_norm": 2.375808000564575, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7168551087379456, + "num_tokens": 368466570.0, + "step": 14562 + }, + { + "epoch": 1.5992752031627497, + "grad_norm": 2.4687705039978027, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7245213985443115, + "num_tokens": 368488181.0, + "step": 14563 + }, + { + "epoch": 1.5993850208653635, + "grad_norm": 2.16597843170166, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7123115062713623, + "num_tokens": 368515687.0, + "step": 14564 + }, + { + "epoch": 1.5994948385679773, + "grad_norm": 2.2550973892211914, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7216042280197144, + "num_tokens": 368542461.0, + "step": 14565 + }, + { + "epoch": 1.5996046562705908, + "grad_norm": 2.164968252182007, + "learning_rate": 1e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.7306574583053589, + "num_tokens": 368567603.0, + "step": 14566 + }, + { + "epoch": 1.5997144739732043, + "grad_norm": 2.225447416305542, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7115285396575928, + "num_tokens": 368594614.0, + "step": 14567 + }, + { + "epoch": 1.599824291675818, + "grad_norm": 2.18495512008667, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7091807723045349, + "num_tokens": 368621064.0, + "step": 14568 + }, + { + "epoch": 1.5999341093784318, + "grad_norm": 2.177384853363037, + "learning_rate": 1e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.7343397736549377, + "num_tokens": 368646651.0, + "step": 14569 + }, + { + "epoch": 1.6000439270810456, + "grad_norm": 2.234731912612915, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7111685872077942, + "num_tokens": 368673050.0, + "step": 14570 + }, + { + "epoch": 1.6001537447836591, + "grad_norm": 2.2370569705963135, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 
0.7201051712036133, + "num_tokens": 368698029.0, + "step": 14571 + }, + { + "epoch": 1.6002635624862727, + "grad_norm": 2.0552191734313965, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7278143763542175, + "num_tokens": 368726285.0, + "step": 14572 + }, + { + "epoch": 1.6003733801888864, + "grad_norm": 2.4320871829986572, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7004141807556152, + "num_tokens": 368751622.0, + "step": 14573 + }, + { + "epoch": 1.6004831978915002, + "grad_norm": 2.188497304916382, + "learning_rate": 1e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.7392914891242981, + "num_tokens": 368778477.0, + "step": 14574 + }, + { + "epoch": 1.600593015594114, + "grad_norm": 2.5955610275268555, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7295262813568115, + "num_tokens": 368798824.0, + "step": 14575 + }, + { + "epoch": 1.6007028332967275, + "grad_norm": 2.162199020385742, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7091482281684875, + "num_tokens": 368825177.0, + "step": 14576 + }, + { + "epoch": 1.600812650999341, + "grad_norm": 2.355512857437134, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7028801441192627, + "num_tokens": 368848421.0, + "step": 14577 + }, + { + "epoch": 1.6009224687019548, + "grad_norm": 2.671792507171631, + "learning_rate": 1e-06, + "loss": 0.7674, + "mean_token_accuracy": 0.7521871328353882, + "num_tokens": 368865553.0, + "step": 14578 + }, + { + "epoch": 1.6010322864045685, + "grad_norm": 2.7162013053894043, + "learning_rate": 1e-06, + "loss": 0.8235, + "mean_token_accuracy": 0.7363878488540649, + "num_tokens": 368884608.0, + "step": 14579 + }, + { + "epoch": 1.601142104107182, + "grad_norm": 2.082913637161255, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.721877932548523, + "num_tokens": 368914072.0, + "step": 14580 + }, + { + "epoch": 1.6012519218097956, + "grad_norm": 
2.0012102127075195, + "learning_rate": 1e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.7278743982315063, + "num_tokens": 368945756.0, + "step": 14581 + }, + { + "epoch": 1.6013617395124093, + "grad_norm": 2.2318737506866455, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7095935344696045, + "num_tokens": 368974569.0, + "step": 14582 + }, + { + "epoch": 1.601471557215023, + "grad_norm": 2.5580010414123535, + "learning_rate": 1e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7232850790023804, + "num_tokens": 368995995.0, + "step": 14583 + }, + { + "epoch": 1.6015813749176369, + "grad_norm": 2.2122559547424316, + "learning_rate": 1e-06, + "loss": 1.025, + "mean_token_accuracy": 0.6914206743240356, + "num_tokens": 369025923.0, + "step": 14584 + }, + { + "epoch": 1.6016911926202504, + "grad_norm": 2.2948851585388184, + "learning_rate": 1e-06, + "loss": 0.869, + "mean_token_accuracy": 0.7299751043319702, + "num_tokens": 369049699.0, + "step": 14585 + }, + { + "epoch": 1.601801010322864, + "grad_norm": 2.3035669326782227, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7153103947639465, + "num_tokens": 369075760.0, + "step": 14586 + }, + { + "epoch": 1.6019108280254777, + "grad_norm": 2.236318826675415, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.6932868361473083, + "num_tokens": 369101585.0, + "step": 14587 + }, + { + "epoch": 1.6020206457280914, + "grad_norm": 2.4659066200256348, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7130115032196045, + "num_tokens": 369123081.0, + "step": 14588 + }, + { + "epoch": 1.602130463430705, + "grad_norm": 2.2348060607910156, + "learning_rate": 1e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.6881810426712036, + "num_tokens": 369150095.0, + "step": 14589 + }, + { + "epoch": 1.6022402811333187, + "grad_norm": 2.1682074069976807, + "learning_rate": 1e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7223531007766724, + "num_tokens": 
369175056.0, + "step": 14590 + }, + { + "epoch": 1.6023500988359323, + "grad_norm": 2.0351951122283936, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.697166919708252, + "num_tokens": 369206140.0, + "step": 14591 + }, + { + "epoch": 1.602459916538546, + "grad_norm": 2.899489402770996, + "learning_rate": 1e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7294945120811462, + "num_tokens": 369225179.0, + "step": 14592 + }, + { + "epoch": 1.6025697342411598, + "grad_norm": 2.203357219696045, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7105646729469299, + "num_tokens": 369253970.0, + "step": 14593 + }, + { + "epoch": 1.6026795519437733, + "grad_norm": 2.0967085361480713, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7207592725753784, + "num_tokens": 369282573.0, + "step": 14594 + }, + { + "epoch": 1.6027893696463869, + "grad_norm": 2.2731521129608154, + "learning_rate": 1e-06, + "loss": 1.0078, + "mean_token_accuracy": 0.6989747285842896, + "num_tokens": 369310359.0, + "step": 14595 + }, + { + "epoch": 1.6028991873490006, + "grad_norm": 2.08827543258667, + "learning_rate": 1e-06, + "loss": 0.8412, + "mean_token_accuracy": 0.7298381328582764, + "num_tokens": 369337563.0, + "step": 14596 + }, + { + "epoch": 1.6030090050516144, + "grad_norm": 2.041033983230591, + "learning_rate": 1e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7255862951278687, + "num_tokens": 369366209.0, + "step": 14597 + }, + { + "epoch": 1.6031188227542281, + "grad_norm": 2.1201772689819336, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7124456167221069, + "num_tokens": 369391767.0, + "step": 14598 + }, + { + "epoch": 1.6032286404568417, + "grad_norm": 2.2594072818756104, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7128056883811951, + "num_tokens": 369418083.0, + "step": 14599 + }, + { + "epoch": 1.6033384581594552, + "grad_norm": 2.1114988327026367, + "learning_rate": 
1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7027260065078735, + "num_tokens": 369449099.0, + "step": 14600 + }, + { + "epoch": 1.603448275862069, + "grad_norm": 2.079683303833008, + "learning_rate": 1e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7244622111320496, + "num_tokens": 369476279.0, + "step": 14601 + }, + { + "epoch": 1.6035580935646827, + "grad_norm": 2.372265577316284, + "learning_rate": 1e-06, + "loss": 0.783, + "mean_token_accuracy": 0.757106602191925, + "num_tokens": 369499812.0, + "step": 14602 + }, + { + "epoch": 1.6036679112672962, + "grad_norm": 2.153301954269409, + "learning_rate": 1e-06, + "loss": 0.858, + "mean_token_accuracy": 0.725645899772644, + "num_tokens": 369526363.0, + "step": 14603 + }, + { + "epoch": 1.60377772896991, + "grad_norm": 2.38983154296875, + "learning_rate": 1e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.73639976978302, + "num_tokens": 369548345.0, + "step": 14604 + }, + { + "epoch": 1.6038875466725235, + "grad_norm": 2.3939385414123535, + "learning_rate": 1e-06, + "loss": 0.8056, + "mean_token_accuracy": 0.7464739084243774, + "num_tokens": 369572075.0, + "step": 14605 + }, + { + "epoch": 1.6039973643751373, + "grad_norm": 2.227476119995117, + "learning_rate": 1e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.7326691150665283, + "num_tokens": 369596632.0, + "step": 14606 + }, + { + "epoch": 1.604107182077751, + "grad_norm": 2.19799542427063, + "learning_rate": 1e-06, + "loss": 0.8944, + "mean_token_accuracy": 0.7177310585975647, + "num_tokens": 369623675.0, + "step": 14607 + }, + { + "epoch": 1.6042169997803646, + "grad_norm": 2.3316774368286133, + "learning_rate": 1e-06, + "loss": 0.7934, + "mean_token_accuracy": 0.7467681169509888, + "num_tokens": 369647495.0, + "step": 14608 + }, + { + "epoch": 1.6043268174829781, + "grad_norm": 2.546599864959717, + "learning_rate": 1e-06, + "loss": 0.7681, + "mean_token_accuracy": 0.757309079170227, + "num_tokens": 369667069.0, + "step": 14609 + }, + { + "epoch": 
1.6044366351855919, + "grad_norm": 1.9802438020706177, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7276468276977539, + "num_tokens": 369699624.0, + "step": 14610 + }, + { + "epoch": 1.6045464528882056, + "grad_norm": 2.147031545639038, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7052679657936096, + "num_tokens": 369729568.0, + "step": 14611 + }, + { + "epoch": 1.6046562705908194, + "grad_norm": 2.1236250400543213, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7033870816230774, + "num_tokens": 369761068.0, + "step": 14612 + }, + { + "epoch": 1.604766088293433, + "grad_norm": 2.3046579360961914, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7327760457992554, + "num_tokens": 369785347.0, + "step": 14613 + }, + { + "epoch": 1.6048759059960465, + "grad_norm": 2.280320405960083, + "learning_rate": 1e-06, + "loss": 0.8653, + "mean_token_accuracy": 0.728878915309906, + "num_tokens": 369809184.0, + "step": 14614 + }, + { + "epoch": 1.6049857236986602, + "grad_norm": 2.170999526977539, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7115391492843628, + "num_tokens": 369837161.0, + "step": 14615 + }, + { + "epoch": 1.605095541401274, + "grad_norm": 2.252643585205078, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7165346145629883, + "num_tokens": 369861430.0, + "step": 14616 + }, + { + "epoch": 1.6052053591038875, + "grad_norm": 2.215533971786499, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.7090879082679749, + "num_tokens": 369887354.0, + "step": 14617 + }, + { + "epoch": 1.605315176806501, + "grad_norm": 1.970158338546753, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7174657583236694, + "num_tokens": 369918116.0, + "step": 14618 + }, + { + "epoch": 1.6054249945091148, + "grad_norm": 2.426888942718506, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 
0.7251170873641968, + "num_tokens": 369939263.0, + "step": 14619 + }, + { + "epoch": 1.6055348122117286, + "grad_norm": 2.0253658294677734, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7042853832244873, + "num_tokens": 369968670.0, + "step": 14620 + }, + { + "epoch": 1.6056446299143423, + "grad_norm": 2.1798903942108154, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.713758111000061, + "num_tokens": 369995951.0, + "step": 14621 + }, + { + "epoch": 1.6057544476169558, + "grad_norm": 2.152730941772461, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7164691090583801, + "num_tokens": 370023704.0, + "step": 14622 + }, + { + "epoch": 1.6058642653195694, + "grad_norm": 2.122293710708618, + "learning_rate": 1e-06, + "loss": 0.8198, + "mean_token_accuracy": 0.746431827545166, + "num_tokens": 370049605.0, + "step": 14623 + }, + { + "epoch": 1.6059740830221831, + "grad_norm": 2.1670689582824707, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7015952467918396, + "num_tokens": 370076001.0, + "step": 14624 + }, + { + "epoch": 1.606083900724797, + "grad_norm": 2.6780998706817627, + "learning_rate": 1e-06, + "loss": 0.8344, + "mean_token_accuracy": 0.7383816242218018, + "num_tokens": 370095659.0, + "step": 14625 + }, + { + "epoch": 1.6061937184274107, + "grad_norm": 2.6231720447540283, + "learning_rate": 1e-06, + "loss": 0.8609, + "mean_token_accuracy": 0.7335418462753296, + "num_tokens": 370115045.0, + "step": 14626 + }, + { + "epoch": 1.6063035361300242, + "grad_norm": 2.553607940673828, + "learning_rate": 1e-06, + "loss": 0.792, + "mean_token_accuracy": 0.7460163831710815, + "num_tokens": 370134855.0, + "step": 14627 + }, + { + "epoch": 1.6064133538326377, + "grad_norm": 2.1471567153930664, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7132725715637207, + "num_tokens": 370162445.0, + "step": 14628 + }, + { + "epoch": 1.6065231715352515, + "grad_norm": 
2.408493995666504, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7204593420028687, + "num_tokens": 370186173.0, + "step": 14629 + }, + { + "epoch": 1.6066329892378652, + "grad_norm": 2.276843786239624, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7092748880386353, + "num_tokens": 370213433.0, + "step": 14630 + }, + { + "epoch": 1.6067428069404788, + "grad_norm": 2.164184331893921, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.6981412768363953, + "num_tokens": 370243682.0, + "step": 14631 + }, + { + "epoch": 1.6068526246430923, + "grad_norm": 2.211493730545044, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7198873162269592, + "num_tokens": 370268676.0, + "step": 14632 + }, + { + "epoch": 1.606962442345706, + "grad_norm": 2.417349338531494, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7135043740272522, + "num_tokens": 370291748.0, + "step": 14633 + }, + { + "epoch": 1.6070722600483198, + "grad_norm": 2.3090460300445557, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7165399193763733, + "num_tokens": 370316572.0, + "step": 14634 + }, + { + "epoch": 1.6071820777509336, + "grad_norm": 2.0598745346069336, + "learning_rate": 1e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.7173365354537964, + "num_tokens": 370347737.0, + "step": 14635 + }, + { + "epoch": 1.6072918954535471, + "grad_norm": 2.2966504096984863, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7094240188598633, + "num_tokens": 370373885.0, + "step": 14636 + }, + { + "epoch": 1.6074017131561606, + "grad_norm": 2.1899540424346924, + "learning_rate": 1e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.7257691621780396, + "num_tokens": 370400077.0, + "step": 14637 + }, + { + "epoch": 1.6075115308587744, + "grad_norm": 2.1035513877868652, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7265851497650146, + "num_tokens": 
370429055.0, + "step": 14638 + }, + { + "epoch": 1.6076213485613882, + "grad_norm": 2.079793930053711, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7153140306472778, + "num_tokens": 370458940.0, + "step": 14639 + }, + { + "epoch": 1.607731166264002, + "grad_norm": 2.425865411758423, + "learning_rate": 1e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.711768388748169, + "num_tokens": 370483178.0, + "step": 14640 + }, + { + "epoch": 1.6078409839666155, + "grad_norm": 2.431363582611084, + "learning_rate": 1e-06, + "loss": 0.844, + "mean_token_accuracy": 0.7345540523529053, + "num_tokens": 370504770.0, + "step": 14641 + }, + { + "epoch": 1.607950801669229, + "grad_norm": 1.9420838356018066, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7302736043930054, + "num_tokens": 370536546.0, + "step": 14642 + }, + { + "epoch": 1.6080606193718427, + "grad_norm": 2.004227638244629, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7174346446990967, + "num_tokens": 370568435.0, + "step": 14643 + }, + { + "epoch": 1.6081704370744565, + "grad_norm": 2.2409307956695557, + "learning_rate": 1e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7205965518951416, + "num_tokens": 370594713.0, + "step": 14644 + }, + { + "epoch": 1.60828025477707, + "grad_norm": 2.5504205226898193, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7120965719223022, + "num_tokens": 370616963.0, + "step": 14645 + }, + { + "epoch": 1.6083900724796836, + "grad_norm": 2.2907752990722656, + "learning_rate": 1e-06, + "loss": 0.8469, + "mean_token_accuracy": 0.7295572757720947, + "num_tokens": 370640635.0, + "step": 14646 + }, + { + "epoch": 1.6084998901822973, + "grad_norm": 2.353630781173706, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7241557836532593, + "num_tokens": 370665184.0, + "step": 14647 + }, + { + "epoch": 1.608609707884911, + "grad_norm": 2.3378524780273438, + "learning_rate": 1e-06, 
+ "loss": 0.8639, + "mean_token_accuracy": 0.7317324876785278, + "num_tokens": 370688602.0, + "step": 14648 + }, + { + "epoch": 1.6087195255875248, + "grad_norm": 2.1950581073760986, + "learning_rate": 1e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7210405468940735, + "num_tokens": 370714667.0, + "step": 14649 + }, + { + "epoch": 1.6088293432901384, + "grad_norm": 2.1722257137298584, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7144715189933777, + "num_tokens": 370740980.0, + "step": 14650 + }, + { + "epoch": 1.608939160992752, + "grad_norm": 2.329890251159668, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.716651976108551, + "num_tokens": 370763510.0, + "step": 14651 + }, + { + "epoch": 1.6090489786953657, + "grad_norm": 2.418287992477417, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7228749990463257, + "num_tokens": 370787400.0, + "step": 14652 + }, + { + "epoch": 1.6091587963979794, + "grad_norm": 2.624664545059204, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7120002508163452, + "num_tokens": 370808434.0, + "step": 14653 + }, + { + "epoch": 1.609268614100593, + "grad_norm": 2.319462537765503, + "learning_rate": 1e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.7143296003341675, + "num_tokens": 370832203.0, + "step": 14654 + }, + { + "epoch": 1.6093784318032067, + "grad_norm": 2.211656332015991, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7001209259033203, + "num_tokens": 370860346.0, + "step": 14655 + }, + { + "epoch": 1.6094882495058203, + "grad_norm": 2.2469873428344727, + "learning_rate": 1e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.714577853679657, + "num_tokens": 370885933.0, + "step": 14656 + }, + { + "epoch": 1.609598067208434, + "grad_norm": 2.223336935043335, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.70721435546875, + "num_tokens": 370913555.0, + "step": 14657 + }, + { + "epoch": 
1.6097078849110478, + "grad_norm": 2.7077651023864746, + "learning_rate": 1e-06, + "loss": 0.8067, + "mean_token_accuracy": 0.7377828359603882, + "num_tokens": 370932313.0, + "step": 14658 + }, + { + "epoch": 1.6098177026136613, + "grad_norm": 2.342102527618408, + "learning_rate": 1e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7238553762435913, + "num_tokens": 370955379.0, + "step": 14659 + }, + { + "epoch": 1.6099275203162748, + "grad_norm": 2.267498016357422, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7196118831634521, + "num_tokens": 370980042.0, + "step": 14660 + }, + { + "epoch": 1.6100373380188886, + "grad_norm": 1.9655240774154663, + "learning_rate": 1e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7311294078826904, + "num_tokens": 371011718.0, + "step": 14661 + }, + { + "epoch": 1.6101471557215024, + "grad_norm": 2.3100688457489014, + "learning_rate": 1e-06, + "loss": 0.8221, + "mean_token_accuracy": 0.743009090423584, + "num_tokens": 371036959.0, + "step": 14662 + }, + { + "epoch": 1.6102569734241161, + "grad_norm": 2.123056650161743, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7068313360214233, + "num_tokens": 371067791.0, + "step": 14663 + }, + { + "epoch": 1.6103667911267296, + "grad_norm": 2.130342483520508, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.703720211982727, + "num_tokens": 371097056.0, + "step": 14664 + }, + { + "epoch": 1.6104766088293432, + "grad_norm": 2.7252485752105713, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7238020896911621, + "num_tokens": 371116542.0, + "step": 14665 + }, + { + "epoch": 1.610586426531957, + "grad_norm": 2.7787704467773438, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.715261697769165, + "num_tokens": 371134850.0, + "step": 14666 + }, + { + "epoch": 1.6106962442345707, + "grad_norm": 2.139570713043213, + "learning_rate": 1e-06, + "loss": 0.8396, + "mean_token_accuracy": 
0.7421821355819702, + "num_tokens": 371163113.0, + "step": 14667 + }, + { + "epoch": 1.6108060619371842, + "grad_norm": 2.2602803707122803, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7219187021255493, + "num_tokens": 371189483.0, + "step": 14668 + }, + { + "epoch": 1.610915879639798, + "grad_norm": 2.100471019744873, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7004953622817993, + "num_tokens": 371218670.0, + "step": 14669 + }, + { + "epoch": 1.6110256973424115, + "grad_norm": 2.2504544258117676, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7262681722640991, + "num_tokens": 371242786.0, + "step": 14670 + }, + { + "epoch": 1.6111355150450253, + "grad_norm": 2.282196283340454, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7090725898742676, + "num_tokens": 371269144.0, + "step": 14671 + }, + { + "epoch": 1.611245332747639, + "grad_norm": 1.9929906129837036, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.707931399345398, + "num_tokens": 371299074.0, + "step": 14672 + }, + { + "epoch": 1.6113551504502526, + "grad_norm": 2.3244516849517822, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7031382322311401, + "num_tokens": 371324705.0, + "step": 14673 + }, + { + "epoch": 1.611464968152866, + "grad_norm": 2.2919394969940186, + "learning_rate": 1e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7211775779724121, + "num_tokens": 371350030.0, + "step": 14674 + }, + { + "epoch": 1.6115747858554799, + "grad_norm": 2.424628734588623, + "learning_rate": 1e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.725862443447113, + "num_tokens": 371372559.0, + "step": 14675 + }, + { + "epoch": 1.6116846035580936, + "grad_norm": 2.4108338356018066, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7253068685531616, + "num_tokens": 371394185.0, + "step": 14676 + }, + { + "epoch": 1.6117944212607074, + "grad_norm": 
1.979837417602539, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7138680219650269, + "num_tokens": 371424800.0, + "step": 14677 + }, + { + "epoch": 1.611904238963321, + "grad_norm": 2.545736312866211, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7142850160598755, + "num_tokens": 371445655.0, + "step": 14678 + }, + { + "epoch": 1.6120140566659344, + "grad_norm": 2.282313108444214, + "learning_rate": 1e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.7298364043235779, + "num_tokens": 371469890.0, + "step": 14679 + }, + { + "epoch": 1.6121238743685482, + "grad_norm": 2.6780941486358643, + "learning_rate": 1e-06, + "loss": 0.7965, + "mean_token_accuracy": 0.7416415810585022, + "num_tokens": 371488141.0, + "step": 14680 + }, + { + "epoch": 1.612233692071162, + "grad_norm": 2.504488706588745, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7041600346565247, + "num_tokens": 371508933.0, + "step": 14681 + }, + { + "epoch": 1.6123435097737755, + "grad_norm": 2.5362913608551025, + "learning_rate": 1e-06, + "loss": 0.8108, + "mean_token_accuracy": 0.735560417175293, + "num_tokens": 371528787.0, + "step": 14682 + }, + { + "epoch": 1.612453327476389, + "grad_norm": 2.2032878398895264, + "learning_rate": 1e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7309173941612244, + "num_tokens": 371558453.0, + "step": 14683 + }, + { + "epoch": 1.6125631451790028, + "grad_norm": 2.278705596923828, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7158299684524536, + "num_tokens": 371584852.0, + "step": 14684 + }, + { + "epoch": 1.6126729628816165, + "grad_norm": 2.2572615146636963, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7312324047088623, + "num_tokens": 371611061.0, + "step": 14685 + }, + { + "epoch": 1.6127827805842303, + "grad_norm": 2.330049991607666, + "learning_rate": 1e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7297582626342773, + "num_tokens": 
371635661.0, + "step": 14686 + }, + { + "epoch": 1.6128925982868438, + "grad_norm": 2.0922060012817383, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7177476286888123, + "num_tokens": 371665468.0, + "step": 14687 + }, + { + "epoch": 1.6130024159894574, + "grad_norm": 2.3496780395507812, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7189114689826965, + "num_tokens": 371689509.0, + "step": 14688 + }, + { + "epoch": 1.6131122336920711, + "grad_norm": 2.441169023513794, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7244200706481934, + "num_tokens": 371711236.0, + "step": 14689 + }, + { + "epoch": 1.6132220513946849, + "grad_norm": 2.5365986824035645, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7109292149543762, + "num_tokens": 371732091.0, + "step": 14690 + }, + { + "epoch": 1.6133318690972986, + "grad_norm": 2.475429058074951, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7273979187011719, + "num_tokens": 371754074.0, + "step": 14691 + }, + { + "epoch": 1.6134416867999122, + "grad_norm": 2.315267324447632, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.7149485349655151, + "num_tokens": 371779622.0, + "step": 14692 + }, + { + "epoch": 1.6135515045025257, + "grad_norm": 2.2998385429382324, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7164889574050903, + "num_tokens": 371803651.0, + "step": 14693 + }, + { + "epoch": 1.6136613222051395, + "grad_norm": 2.306267499923706, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.715975284576416, + "num_tokens": 371829013.0, + "step": 14694 + }, + { + "epoch": 1.6137711399077532, + "grad_norm": 2.8337786197662354, + "learning_rate": 1e-06, + "loss": 0.8623, + "mean_token_accuracy": 0.7343651056289673, + "num_tokens": 371846548.0, + "step": 14695 + }, + { + "epoch": 1.6138809576103668, + "grad_norm": 2.3798255920410156, + "learning_rate": 
1e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.722916305065155, + "num_tokens": 371868422.0, + "step": 14696 + }, + { + "epoch": 1.6139907753129803, + "grad_norm": 2.171982765197754, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7195500135421753, + "num_tokens": 371895453.0, + "step": 14697 + }, + { + "epoch": 1.614100593015594, + "grad_norm": 2.243061065673828, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7069894671440125, + "num_tokens": 371920550.0, + "step": 14698 + }, + { + "epoch": 1.6142104107182078, + "grad_norm": 2.7593581676483154, + "learning_rate": 1e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7296459674835205, + "num_tokens": 371938262.0, + "step": 14699 + }, + { + "epoch": 1.6143202284208216, + "grad_norm": 2.267329216003418, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7198238372802734, + "num_tokens": 371965387.0, + "step": 14700 + }, + { + "epoch": 1.614430046123435, + "grad_norm": 2.2849154472351074, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7105158567428589, + "num_tokens": 371991337.0, + "step": 14701 + }, + { + "epoch": 1.6145398638260486, + "grad_norm": 2.300835609436035, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7191026210784912, + "num_tokens": 372017058.0, + "step": 14702 + }, + { + "epoch": 1.6146496815286624, + "grad_norm": 2.328387498855591, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7185861468315125, + "num_tokens": 372041372.0, + "step": 14703 + }, + { + "epoch": 1.6147594992312762, + "grad_norm": 2.423360586166382, + "learning_rate": 1e-06, + "loss": 0.8766, + "mean_token_accuracy": 0.7206618785858154, + "num_tokens": 372061973.0, + "step": 14704 + }, + { + "epoch": 1.61486931693389, + "grad_norm": 2.2864439487457275, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.70892333984375, + "num_tokens": 372088735.0, + "step": 14705 + }, + { + 
"epoch": 1.6149791346365034, + "grad_norm": 2.185642719268799, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7255679965019226, + "num_tokens": 372115391.0, + "step": 14706 + }, + { + "epoch": 1.615088952339117, + "grad_norm": 2.3591058254241943, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7227129340171814, + "num_tokens": 372137608.0, + "step": 14707 + }, + { + "epoch": 1.6151987700417307, + "grad_norm": 2.3548786640167236, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7097452878952026, + "num_tokens": 372161807.0, + "step": 14708 + }, + { + "epoch": 1.6153085877443445, + "grad_norm": 2.0848021507263184, + "learning_rate": 1e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7367455959320068, + "num_tokens": 372190838.0, + "step": 14709 + }, + { + "epoch": 1.615418405446958, + "grad_norm": 1.904575228691101, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7011680006980896, + "num_tokens": 372224401.0, + "step": 14710 + }, + { + "epoch": 1.6155282231495716, + "grad_norm": 2.021806001663208, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.6951966881752014, + "num_tokens": 372256105.0, + "step": 14711 + }, + { + "epoch": 1.6156380408521853, + "grad_norm": 2.1957833766937256, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7192606925964355, + "num_tokens": 372281522.0, + "step": 14712 + }, + { + "epoch": 1.615747858554799, + "grad_norm": 2.18857479095459, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7090219259262085, + "num_tokens": 372306684.0, + "step": 14713 + }, + { + "epoch": 1.6158576762574128, + "grad_norm": 2.391040563583374, + "learning_rate": 1e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.7225574254989624, + "num_tokens": 372331101.0, + "step": 14714 + }, + { + "epoch": 1.6159674939600264, + "grad_norm": 2.8953442573547363, + "learning_rate": 1e-06, + "loss": 0.8054, + 
"mean_token_accuracy": 0.7443355321884155, + "num_tokens": 372348404.0, + "step": 14715 + }, + { + "epoch": 1.61607731166264, + "grad_norm": 2.3949835300445557, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7167778015136719, + "num_tokens": 372370449.0, + "step": 14716 + }, + { + "epoch": 1.6161871293652537, + "grad_norm": 2.3414976596832275, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7277570366859436, + "num_tokens": 372392894.0, + "step": 14717 + }, + { + "epoch": 1.6162969470678674, + "grad_norm": 2.2514379024505615, + "learning_rate": 1e-06, + "loss": 0.8204, + "mean_token_accuracy": 0.7374616861343384, + "num_tokens": 372416846.0, + "step": 14718 + }, + { + "epoch": 1.616406764770481, + "grad_norm": 2.141136407852173, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7134594917297363, + "num_tokens": 372444815.0, + "step": 14719 + }, + { + "epoch": 1.6165165824730947, + "grad_norm": 2.5454208850860596, + "learning_rate": 1e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.7233861088752747, + "num_tokens": 372467447.0, + "step": 14720 + }, + { + "epoch": 1.6166264001757082, + "grad_norm": 2.3721823692321777, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7101085186004639, + "num_tokens": 372492216.0, + "step": 14721 + }, + { + "epoch": 1.616736217878322, + "grad_norm": 2.400745153427124, + "learning_rate": 1e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.731702446937561, + "num_tokens": 372513860.0, + "step": 14722 + }, + { + "epoch": 1.6168460355809358, + "grad_norm": 2.122847080230713, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7294358015060425, + "num_tokens": 372542321.0, + "step": 14723 + }, + { + "epoch": 1.6169558532835493, + "grad_norm": 2.091559648513794, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7088446617126465, + "num_tokens": 372573231.0, + "step": 14724 + }, + { + "epoch": 1.6170656709861628, + 
"grad_norm": 2.249544858932495, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.6895807981491089, + "num_tokens": 372598969.0, + "step": 14725 + }, + { + "epoch": 1.6171754886887766, + "grad_norm": 1.9113404750823975, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7164082527160645, + "num_tokens": 372631493.0, + "step": 14726 + }, + { + "epoch": 1.6172853063913903, + "grad_norm": 2.315694808959961, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7174832820892334, + "num_tokens": 372656334.0, + "step": 14727 + }, + { + "epoch": 1.617395124094004, + "grad_norm": 2.4274985790252686, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.708621084690094, + "num_tokens": 372680855.0, + "step": 14728 + }, + { + "epoch": 1.6175049417966176, + "grad_norm": 2.102069854736328, + "learning_rate": 1e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7147524952888489, + "num_tokens": 372713037.0, + "step": 14729 + }, + { + "epoch": 1.6176147594992312, + "grad_norm": 2.250760793685913, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7050292491912842, + "num_tokens": 372741909.0, + "step": 14730 + }, + { + "epoch": 1.617724577201845, + "grad_norm": 2.3961596488952637, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.6995662450790405, + "num_tokens": 372770613.0, + "step": 14731 + }, + { + "epoch": 1.6178343949044587, + "grad_norm": 2.3085813522338867, + "learning_rate": 1e-06, + "loss": 0.8505, + "mean_token_accuracy": 0.7331030368804932, + "num_tokens": 372794325.0, + "step": 14732 + }, + { + "epoch": 1.6179442126070722, + "grad_norm": 1.8665889501571655, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.6980140209197998, + "num_tokens": 372832599.0, + "step": 14733 + }, + { + "epoch": 1.618054030309686, + "grad_norm": 2.5374338626861572, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.710300862789154, + 
"num_tokens": 372856238.0, + "step": 14734 + }, + { + "epoch": 1.6181638480122995, + "grad_norm": 2.3579719066619873, + "learning_rate": 1e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7263021469116211, + "num_tokens": 372879994.0, + "step": 14735 + }, + { + "epoch": 1.6182736657149133, + "grad_norm": 2.590517520904541, + "learning_rate": 1e-06, + "loss": 0.7987, + "mean_token_accuracy": 0.7450652122497559, + "num_tokens": 372899900.0, + "step": 14736 + }, + { + "epoch": 1.618383483417527, + "grad_norm": 2.5332436561584473, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.7304990291595459, + "num_tokens": 372926108.0, + "step": 14737 + }, + { + "epoch": 1.6184933011201406, + "grad_norm": 2.3129336833953857, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.720149576663971, + "num_tokens": 372952793.0, + "step": 14738 + }, + { + "epoch": 1.618603118822754, + "grad_norm": 2.2568767070770264, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.6988826394081116, + "num_tokens": 372977745.0, + "step": 14739 + }, + { + "epoch": 1.6187129365253679, + "grad_norm": 2.462207317352295, + "learning_rate": 1e-06, + "loss": 0.7584, + "mean_token_accuracy": 0.7543869018554688, + "num_tokens": 372998489.0, + "step": 14740 + }, + { + "epoch": 1.6188227542279816, + "grad_norm": 2.3622329235076904, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7279534935951233, + "num_tokens": 373021426.0, + "step": 14741 + }, + { + "epoch": 1.6189325719305954, + "grad_norm": 2.1335740089416504, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7160475254058838, + "num_tokens": 373050650.0, + "step": 14742 + }, + { + "epoch": 1.619042389633209, + "grad_norm": 2.0722768306732178, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.718891978263855, + "num_tokens": 373080027.0, + "step": 14743 + }, + { + "epoch": 1.6191522073358224, + "grad_norm": 2.5319647789001465, + 
"learning_rate": 1e-06, + "loss": 0.7995, + "mean_token_accuracy": 0.7445829510688782, + "num_tokens": 373100297.0, + "step": 14744 + }, + { + "epoch": 1.6192620250384362, + "grad_norm": 2.2060673236846924, + "learning_rate": 1e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.7294069528579712, + "num_tokens": 373125911.0, + "step": 14745 + }, + { + "epoch": 1.61937184274105, + "grad_norm": 2.2236924171447754, + "learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7157308459281921, + "num_tokens": 373151328.0, + "step": 14746 + }, + { + "epoch": 1.6194816604436635, + "grad_norm": 2.461826801300049, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7251179218292236, + "num_tokens": 373174018.0, + "step": 14747 + }, + { + "epoch": 1.619591478146277, + "grad_norm": 2.1271286010742188, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.6963891983032227, + "num_tokens": 373202154.0, + "step": 14748 + }, + { + "epoch": 1.6197012958488908, + "grad_norm": 2.1638729572296143, + "learning_rate": 1e-06, + "loss": 0.8647, + "mean_token_accuracy": 0.7316645383834839, + "num_tokens": 373229083.0, + "step": 14749 + }, + { + "epoch": 1.6198111135515045, + "grad_norm": 2.3030102252960205, + "learning_rate": 1e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.73653244972229, + "num_tokens": 373252994.0, + "step": 14750 + }, + { + "epoch": 1.6199209312541183, + "grad_norm": 2.2542243003845215, + "learning_rate": 1e-06, + "loss": 0.7839, + "mean_token_accuracy": 0.7515538334846497, + "num_tokens": 373275021.0, + "step": 14751 + }, + { + "epoch": 1.6200307489567318, + "grad_norm": 2.1002912521362305, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.72527015209198, + "num_tokens": 373302725.0, + "step": 14752 + }, + { + "epoch": 1.6201405666593454, + "grad_norm": 2.327075242996216, + "learning_rate": 1e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.7347536683082581, + "num_tokens": 373326040.0, + "step": 
14753 + }, + { + "epoch": 1.6202503843619591, + "grad_norm": 2.343344211578369, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7392643690109253, + "num_tokens": 373347219.0, + "step": 14754 + }, + { + "epoch": 1.6203602020645729, + "grad_norm": 2.1533164978027344, + "learning_rate": 1e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7236250042915344, + "num_tokens": 373375813.0, + "step": 14755 + }, + { + "epoch": 1.6204700197671866, + "grad_norm": 2.2846786975860596, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.718061625957489, + "num_tokens": 373400510.0, + "step": 14756 + }, + { + "epoch": 1.6205798374698002, + "grad_norm": 2.1045761108398438, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.714184045791626, + "num_tokens": 373429232.0, + "step": 14757 + }, + { + "epoch": 1.6206896551724137, + "grad_norm": 2.138354539871216, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7134668231010437, + "num_tokens": 373458984.0, + "step": 14758 + }, + { + "epoch": 1.6207994728750275, + "grad_norm": 2.5075175762176514, + "learning_rate": 1e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7240816950798035, + "num_tokens": 373479665.0, + "step": 14759 + }, + { + "epoch": 1.6209092905776412, + "grad_norm": 2.117342472076416, + "learning_rate": 1e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7370126247406006, + "num_tokens": 373508710.0, + "step": 14760 + }, + { + "epoch": 1.6210191082802548, + "grad_norm": 2.3781490325927734, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7164648175239563, + "num_tokens": 373533416.0, + "step": 14761 + }, + { + "epoch": 1.6211289259828683, + "grad_norm": 2.1952414512634277, + "learning_rate": 1e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.7214727401733398, + "num_tokens": 373560807.0, + "step": 14762 + }, + { + "epoch": 1.621238743685482, + "grad_norm": 2.0436086654663086, + "learning_rate": 1e-06, + "loss": 0.9554, 
+ "mean_token_accuracy": 0.7068065404891968, + "num_tokens": 373593443.0, + "step": 14763 + }, + { + "epoch": 1.6213485613880958, + "grad_norm": 2.2267799377441406, + "learning_rate": 1e-06, + "loss": 0.8608, + "mean_token_accuracy": 0.7356491088867188, + "num_tokens": 373618691.0, + "step": 14764 + }, + { + "epoch": 1.6214583790907096, + "grad_norm": 2.1197867393493652, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7233502864837646, + "num_tokens": 373645987.0, + "step": 14765 + }, + { + "epoch": 1.621568196793323, + "grad_norm": 2.4710030555725098, + "learning_rate": 1e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7150671482086182, + "num_tokens": 373669018.0, + "step": 14766 + }, + { + "epoch": 1.6216780144959366, + "grad_norm": 2.426029920578003, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7162781953811646, + "num_tokens": 373692324.0, + "step": 14767 + }, + { + "epoch": 1.6217878321985504, + "grad_norm": 2.4366836547851562, + "learning_rate": 1e-06, + "loss": 0.84, + "mean_token_accuracy": 0.7375996112823486, + "num_tokens": 373715534.0, + "step": 14768 + }, + { + "epoch": 1.6218976499011641, + "grad_norm": 2.2147858142852783, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7147959470748901, + "num_tokens": 373741796.0, + "step": 14769 + }, + { + "epoch": 1.6220074676037777, + "grad_norm": 2.4496729373931885, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7321208715438843, + "num_tokens": 373765121.0, + "step": 14770 + }, + { + "epoch": 1.6221172853063914, + "grad_norm": 2.040985584259033, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.6965800523757935, + "num_tokens": 373796253.0, + "step": 14771 + }, + { + "epoch": 1.622227103009005, + "grad_norm": 2.2241604328155518, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.6918060779571533, + "num_tokens": 373825236.0, + "step": 14772 + }, + { + "epoch": 
1.6223369207116187, + "grad_norm": 2.6441235542297363, + "learning_rate": 1e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.7111207842826843, + "num_tokens": 373845899.0, + "step": 14773 + }, + { + "epoch": 1.6224467384142325, + "grad_norm": 2.1970162391662598, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7081020474433899, + "num_tokens": 373872254.0, + "step": 14774 + }, + { + "epoch": 1.622556556116846, + "grad_norm": 2.100428342819214, + "learning_rate": 1e-06, + "loss": 0.8451, + "mean_token_accuracy": 0.7395544052124023, + "num_tokens": 373898926.0, + "step": 14775 + }, + { + "epoch": 1.6226663738194596, + "grad_norm": 2.568769693374634, + "learning_rate": 1e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7254090309143066, + "num_tokens": 373919303.0, + "step": 14776 + }, + { + "epoch": 1.6227761915220733, + "grad_norm": 2.337947368621826, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7256180644035339, + "num_tokens": 373943501.0, + "step": 14777 + }, + { + "epoch": 1.622886009224687, + "grad_norm": 2.273730754852295, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7010771632194519, + "num_tokens": 373970088.0, + "step": 14778 + }, + { + "epoch": 1.6229958269273008, + "grad_norm": 2.033564329147339, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.71553635597229, + "num_tokens": 374000542.0, + "step": 14779 + }, + { + "epoch": 1.6231056446299144, + "grad_norm": 2.285435914993286, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7131719589233398, + "num_tokens": 374026469.0, + "step": 14780 + }, + { + "epoch": 1.623215462332528, + "grad_norm": 2.392545223236084, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7139953374862671, + "num_tokens": 374050374.0, + "step": 14781 + }, + { + "epoch": 1.6233252800351416, + "grad_norm": 2.101296901702881, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 
0.7257061004638672, + "num_tokens": 374078559.0, + "step": 14782 + }, + { + "epoch": 1.6234350977377554, + "grad_norm": 2.2335610389709473, + "learning_rate": 1e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7151464819908142, + "num_tokens": 374102804.0, + "step": 14783 + }, + { + "epoch": 1.623544915440369, + "grad_norm": 2.1394057273864746, + "learning_rate": 1e-06, + "loss": 0.884, + "mean_token_accuracy": 0.7324421405792236, + "num_tokens": 374130781.0, + "step": 14784 + }, + { + "epoch": 1.6236547331429827, + "grad_norm": 2.4834916591644287, + "learning_rate": 1e-06, + "loss": 0.8617, + "mean_token_accuracy": 0.7281394600868225, + "num_tokens": 374151853.0, + "step": 14785 + }, + { + "epoch": 1.6237645508455962, + "grad_norm": 1.9344216585159302, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7148551940917969, + "num_tokens": 374183722.0, + "step": 14786 + }, + { + "epoch": 1.62387436854821, + "grad_norm": 2.096027374267578, + "learning_rate": 1e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7268508672714233, + "num_tokens": 374211618.0, + "step": 14787 + }, + { + "epoch": 1.6239841862508237, + "grad_norm": 2.3255183696746826, + "learning_rate": 1e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.7323427200317383, + "num_tokens": 374234647.0, + "step": 14788 + }, + { + "epoch": 1.6240940039534373, + "grad_norm": 2.2424826622009277, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7109230756759644, + "num_tokens": 374262122.0, + "step": 14789 + }, + { + "epoch": 1.6242038216560508, + "grad_norm": 2.272397041320801, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7101598978042603, + "num_tokens": 374289557.0, + "step": 14790 + }, + { + "epoch": 1.6243136393586646, + "grad_norm": 1.9272468090057373, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7000136375427246, + "num_tokens": 374324042.0, + "step": 14791 + }, + { + "epoch": 1.6244234570612783, + "grad_norm": 
2.0434722900390625, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7034223079681396, + "num_tokens": 374355872.0, + "step": 14792 + }, + { + "epoch": 1.624533274763892, + "grad_norm": 1.998247504234314, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7156251668930054, + "num_tokens": 374386501.0, + "step": 14793 + }, + { + "epoch": 1.6246430924665056, + "grad_norm": 2.2822670936584473, + "learning_rate": 1e-06, + "loss": 0.8484, + "mean_token_accuracy": 0.7368414402008057, + "num_tokens": 374411052.0, + "step": 14794 + }, + { + "epoch": 1.6247529101691192, + "grad_norm": 2.268094301223755, + "learning_rate": 1e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.7312183380126953, + "num_tokens": 374435866.0, + "step": 14795 + }, + { + "epoch": 1.624862727871733, + "grad_norm": 2.110574245452881, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.6972382068634033, + "num_tokens": 374466123.0, + "step": 14796 + }, + { + "epoch": 1.6249725455743467, + "grad_norm": 2.6210474967956543, + "learning_rate": 1e-06, + "loss": 0.7885, + "mean_token_accuracy": 0.7545595765113831, + "num_tokens": 374484365.0, + "step": 14797 + }, + { + "epoch": 1.6250823632769602, + "grad_norm": 2.2584803104400635, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7058526277542114, + "num_tokens": 374510135.0, + "step": 14798 + }, + { + "epoch": 1.6251921809795737, + "grad_norm": 2.320117950439453, + "learning_rate": 1e-06, + "loss": 0.8399, + "mean_token_accuracy": 0.7339857220649719, + "num_tokens": 374533794.0, + "step": 14799 + }, + { + "epoch": 1.6253019986821875, + "grad_norm": 2.2257373332977295, + "learning_rate": 1e-06, + "loss": 0.865, + "mean_token_accuracy": 0.7234154939651489, + "num_tokens": 374560536.0, + "step": 14800 + }, + { + "epoch": 1.6254118163848013, + "grad_norm": 2.0600507259368896, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7140474319458008, + "num_tokens": 
374591918.0, + "step": 14801 + }, + { + "epoch": 1.625521634087415, + "grad_norm": 2.1690187454223633, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7268129587173462, + "num_tokens": 374617139.0, + "step": 14802 + }, + { + "epoch": 1.6256314517900285, + "grad_norm": 2.2953221797943115, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7121667861938477, + "num_tokens": 374641673.0, + "step": 14803 + }, + { + "epoch": 1.625741269492642, + "grad_norm": 2.351550817489624, + "learning_rate": 1e-06, + "loss": 0.8578, + "mean_token_accuracy": 0.7308694124221802, + "num_tokens": 374664272.0, + "step": 14804 + }, + { + "epoch": 1.6258510871952558, + "grad_norm": 2.217028856277466, + "learning_rate": 1e-06, + "loss": 0.8078, + "mean_token_accuracy": 0.744549036026001, + "num_tokens": 374689505.0, + "step": 14805 + }, + { + "epoch": 1.6259609048978696, + "grad_norm": 1.9795087575912476, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7045221328735352, + "num_tokens": 374720914.0, + "step": 14806 + }, + { + "epoch": 1.6260707226004834, + "grad_norm": 2.0367789268493652, + "learning_rate": 1e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7211999297142029, + "num_tokens": 374751329.0, + "step": 14807 + }, + { + "epoch": 1.6261805403030969, + "grad_norm": 2.2956442832946777, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7191451787948608, + "num_tokens": 374778001.0, + "step": 14808 + }, + { + "epoch": 1.6262903580057104, + "grad_norm": 2.211176872253418, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7174506187438965, + "num_tokens": 374804285.0, + "step": 14809 + }, + { + "epoch": 1.6264001757083242, + "grad_norm": 2.237851619720459, + "learning_rate": 1e-06, + "loss": 0.8081, + "mean_token_accuracy": 0.74387526512146, + "num_tokens": 374828230.0, + "step": 14810 + }, + { + "epoch": 1.626509993410938, + "grad_norm": 2.7075560092926025, + "learning_rate": 
1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7135621309280396, + "num_tokens": 374847628.0, + "step": 14811 + }, + { + "epoch": 1.6266198111135515, + "grad_norm": 2.456613540649414, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.716389536857605, + "num_tokens": 374869523.0, + "step": 14812 + }, + { + "epoch": 1.626729628816165, + "grad_norm": 2.3724684715270996, + "learning_rate": 1e-06, + "loss": 0.8353, + "mean_token_accuracy": 0.7419434785842896, + "num_tokens": 374894098.0, + "step": 14813 + }, + { + "epoch": 1.6268394465187788, + "grad_norm": 2.106627941131592, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7208552956581116, + "num_tokens": 374924632.0, + "step": 14814 + }, + { + "epoch": 1.6269492642213925, + "grad_norm": 2.489804267883301, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7108651399612427, + "num_tokens": 374946462.0, + "step": 14815 + }, + { + "epoch": 1.6270590819240063, + "grad_norm": 2.138861894607544, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7248432040214539, + "num_tokens": 374973153.0, + "step": 14816 + }, + { + "epoch": 1.6271688996266198, + "grad_norm": 2.1392908096313477, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7138437628746033, + "num_tokens": 375002469.0, + "step": 14817 + }, + { + "epoch": 1.6272787173292333, + "grad_norm": 2.8621673583984375, + "learning_rate": 1e-06, + "loss": 0.8191, + "mean_token_accuracy": 0.7416541576385498, + "num_tokens": 375018505.0, + "step": 14818 + }, + { + "epoch": 1.627388535031847, + "grad_norm": 2.2795767784118652, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7123653888702393, + "num_tokens": 375044057.0, + "step": 14819 + }, + { + "epoch": 1.6274983527344609, + "grad_norm": 2.033992052078247, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7175170183181763, + "num_tokens": 375072823.0, + "step": 14820 + }, + { + 
"epoch": 1.6276081704370746, + "grad_norm": 2.0761709213256836, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7213517427444458, + "num_tokens": 375101081.0, + "step": 14821 + }, + { + "epoch": 1.6277179881396882, + "grad_norm": 2.2222180366516113, + "learning_rate": 1e-06, + "loss": 0.8108, + "mean_token_accuracy": 0.7493733167648315, + "num_tokens": 375125156.0, + "step": 14822 + }, + { + "epoch": 1.6278278058423017, + "grad_norm": 2.371333599090576, + "learning_rate": 1e-06, + "loss": 0.8291, + "mean_token_accuracy": 0.7412620186805725, + "num_tokens": 375148203.0, + "step": 14823 + }, + { + "epoch": 1.6279376235449154, + "grad_norm": 2.3808200359344482, + "learning_rate": 1e-06, + "loss": 0.863, + "mean_token_accuracy": 0.7313649654388428, + "num_tokens": 375172149.0, + "step": 14824 + }, + { + "epoch": 1.6280474412475292, + "grad_norm": 2.082679510116577, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7168436050415039, + "num_tokens": 375202025.0, + "step": 14825 + }, + { + "epoch": 1.6281572589501427, + "grad_norm": 2.313209056854248, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7047656178474426, + "num_tokens": 375227542.0, + "step": 14826 + }, + { + "epoch": 1.6282670766527563, + "grad_norm": 2.1275827884674072, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7159186601638794, + "num_tokens": 375257280.0, + "step": 14827 + }, + { + "epoch": 1.62837689435537, + "grad_norm": 2.210989475250244, + "learning_rate": 1e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7178245186805725, + "num_tokens": 375284083.0, + "step": 14828 + }, + { + "epoch": 1.6284867120579838, + "grad_norm": 2.321113348007202, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7024329900741577, + "num_tokens": 375310377.0, + "step": 14829 + }, + { + "epoch": 1.6285965297605975, + "grad_norm": 2.087385416030884, + "learning_rate": 1e-06, + "loss": 0.9589, + 
"mean_token_accuracy": 0.7031437158584595, + "num_tokens": 375339316.0, + "step": 14830 + }, + { + "epoch": 1.628706347463211, + "grad_norm": 2.182152032852173, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7215781211853027, + "num_tokens": 375366849.0, + "step": 14831 + }, + { + "epoch": 1.6288161651658246, + "grad_norm": 2.2011430263519287, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7098875045776367, + "num_tokens": 375392377.0, + "step": 14832 + }, + { + "epoch": 1.6289259828684384, + "grad_norm": 2.4037015438079834, + "learning_rate": 1e-06, + "loss": 0.8725, + "mean_token_accuracy": 0.7280219793319702, + "num_tokens": 375416334.0, + "step": 14833 + }, + { + "epoch": 1.6290358005710521, + "grad_norm": 2.37638521194458, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7158058285713196, + "num_tokens": 375438801.0, + "step": 14834 + }, + { + "epoch": 1.6291456182736657, + "grad_norm": 2.5062386989593506, + "learning_rate": 1e-06, + "loss": 0.8088, + "mean_token_accuracy": 0.7426990270614624, + "num_tokens": 375460513.0, + "step": 14835 + }, + { + "epoch": 1.6292554359762794, + "grad_norm": 2.121701240539551, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7016925811767578, + "num_tokens": 375488554.0, + "step": 14836 + }, + { + "epoch": 1.629365253678893, + "grad_norm": 1.9610856771469116, + "learning_rate": 1e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.691497802734375, + "num_tokens": 375522433.0, + "step": 14837 + }, + { + "epoch": 1.6294750713815067, + "grad_norm": 2.7322304248809814, + "learning_rate": 1e-06, + "loss": 0.8264, + "mean_token_accuracy": 0.7355489730834961, + "num_tokens": 375539776.0, + "step": 14838 + }, + { + "epoch": 1.6295848890841205, + "grad_norm": 2.401616096496582, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.6980801820755005, + "num_tokens": 375564537.0, + "step": 14839 + }, + { + "epoch": 1.629694706786734, 
+ "grad_norm": 2.132254123687744, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7088596820831299, + "num_tokens": 375592677.0, + "step": 14840 + }, + { + "epoch": 1.6298045244893475, + "grad_norm": 2.4432804584503174, + "learning_rate": 1e-06, + "loss": 0.8454, + "mean_token_accuracy": 0.7374051809310913, + "num_tokens": 375614141.0, + "step": 14841 + }, + { + "epoch": 1.6299143421919613, + "grad_norm": 2.347088575363159, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7240140438079834, + "num_tokens": 375637679.0, + "step": 14842 + }, + { + "epoch": 1.630024159894575, + "grad_norm": 2.243126392364502, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.6996390223503113, + "num_tokens": 375663780.0, + "step": 14843 + }, + { + "epoch": 1.6301339775971888, + "grad_norm": 2.271171808242798, + "learning_rate": 1e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.7223218679428101, + "num_tokens": 375688831.0, + "step": 14844 + }, + { + "epoch": 1.6302437952998023, + "grad_norm": 2.1565792560577393, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7161574363708496, + "num_tokens": 375716652.0, + "step": 14845 + }, + { + "epoch": 1.6303536130024159, + "grad_norm": 2.247148275375366, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7156920433044434, + "num_tokens": 375745864.0, + "step": 14846 + }, + { + "epoch": 1.6304634307050296, + "grad_norm": 2.208637237548828, + "learning_rate": 1e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7337064146995544, + "num_tokens": 375773295.0, + "step": 14847 + }, + { + "epoch": 1.6305732484076434, + "grad_norm": 2.263343334197998, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7129012942314148, + "num_tokens": 375799099.0, + "step": 14848 + }, + { + "epoch": 1.630683066110257, + "grad_norm": 2.416240692138672, + "learning_rate": 1e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.7356244325637817, + 
"num_tokens": 375820824.0, + "step": 14849 + }, + { + "epoch": 1.6307928838128707, + "grad_norm": 2.4107377529144287, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7320197820663452, + "num_tokens": 375846018.0, + "step": 14850 + }, + { + "epoch": 1.6309027015154842, + "grad_norm": 2.3642640113830566, + "learning_rate": 1e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.7363954782485962, + "num_tokens": 375869245.0, + "step": 14851 + }, + { + "epoch": 1.631012519218098, + "grad_norm": 2.23423433303833, + "learning_rate": 1e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.7208292484283447, + "num_tokens": 375895390.0, + "step": 14852 + }, + { + "epoch": 1.6311223369207117, + "grad_norm": 2.411245107650757, + "learning_rate": 1e-06, + "loss": 0.7812, + "mean_token_accuracy": 0.7496057748794556, + "num_tokens": 375915662.0, + "step": 14853 + }, + { + "epoch": 1.6312321546233253, + "grad_norm": 2.628380537033081, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7250602841377258, + "num_tokens": 375935193.0, + "step": 14854 + }, + { + "epoch": 1.6313419723259388, + "grad_norm": 2.0298149585723877, + "learning_rate": 1e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7309474349021912, + "num_tokens": 375965181.0, + "step": 14855 + }, + { + "epoch": 1.6314517900285526, + "grad_norm": 2.0574052333831787, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.716292679309845, + "num_tokens": 375995831.0, + "step": 14856 + }, + { + "epoch": 1.6315616077311663, + "grad_norm": 2.0586726665496826, + "learning_rate": 1e-06, + "loss": 0.7248, + "mean_token_accuracy": 0.765079140663147, + "num_tokens": 376022237.0, + "step": 14857 + }, + { + "epoch": 1.63167142543378, + "grad_norm": 2.54325795173645, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7040959000587463, + "num_tokens": 376044120.0, + "step": 14858 + }, + { + "epoch": 1.6317812431363936, + "grad_norm": 2.3288464546203613, + 
"learning_rate": 1e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.727554440498352, + "num_tokens": 376068391.0, + "step": 14859 + }, + { + "epoch": 1.6318910608390071, + "grad_norm": 2.2384142875671387, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7092678546905518, + "num_tokens": 376096990.0, + "step": 14860 + }, + { + "epoch": 1.632000878541621, + "grad_norm": 2.395782947540283, + "learning_rate": 1e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.7124829292297363, + "num_tokens": 376118579.0, + "step": 14861 + }, + { + "epoch": 1.6321106962442347, + "grad_norm": 2.4013094902038574, + "learning_rate": 1e-06, + "loss": 0.7805, + "mean_token_accuracy": 0.7508113980293274, + "num_tokens": 376140932.0, + "step": 14862 + }, + { + "epoch": 1.6322205139468482, + "grad_norm": 2.1132538318634033, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7273136377334595, + "num_tokens": 376169858.0, + "step": 14863 + }, + { + "epoch": 1.6323303316494617, + "grad_norm": 2.125296115875244, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7122710943222046, + "num_tokens": 376197137.0, + "step": 14864 + }, + { + "epoch": 1.6324401493520755, + "grad_norm": 2.3811147212982178, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7322649955749512, + "num_tokens": 376218699.0, + "step": 14865 + }, + { + "epoch": 1.6325499670546892, + "grad_norm": 2.2256152629852295, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.6997113823890686, + "num_tokens": 376244513.0, + "step": 14866 + }, + { + "epoch": 1.632659784757303, + "grad_norm": 2.2779171466827393, + "learning_rate": 1e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.724358856678009, + "num_tokens": 376269837.0, + "step": 14867 + }, + { + "epoch": 1.6327696024599165, + "grad_norm": 2.5088694095611572, + "learning_rate": 1e-06, + "loss": 0.8018, + "mean_token_accuracy": 0.7448480129241943, + "num_tokens": 376291231.0, + 
"step": 14868 + }, + { + "epoch": 1.63287942016253, + "grad_norm": 2.067399263381958, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.6983511447906494, + "num_tokens": 376321626.0, + "step": 14869 + }, + { + "epoch": 1.6329892378651438, + "grad_norm": 2.1445841789245605, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7263944149017334, + "num_tokens": 376347746.0, + "step": 14870 + }, + { + "epoch": 1.6330990555677576, + "grad_norm": 2.4100897312164307, + "learning_rate": 1e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.6916478276252747, + "num_tokens": 376373187.0, + "step": 14871 + }, + { + "epoch": 1.6332088732703713, + "grad_norm": 2.4189093112945557, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7242332696914673, + "num_tokens": 376396228.0, + "step": 14872 + }, + { + "epoch": 1.6333186909729849, + "grad_norm": 2.3807966709136963, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7104138135910034, + "num_tokens": 376422791.0, + "step": 14873 + }, + { + "epoch": 1.6334285086755984, + "grad_norm": 2.43208384513855, + "learning_rate": 1e-06, + "loss": 0.8352, + "mean_token_accuracy": 0.7319596409797668, + "num_tokens": 376443476.0, + "step": 14874 + }, + { + "epoch": 1.6335383263782122, + "grad_norm": 2.3547520637512207, + "learning_rate": 1e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7311734557151794, + "num_tokens": 376468483.0, + "step": 14875 + }, + { + "epoch": 1.633648144080826, + "grad_norm": 2.1691548824310303, + "learning_rate": 1e-06, + "loss": 0.7818, + "mean_token_accuracy": 0.7550294995307922, + "num_tokens": 376493134.0, + "step": 14876 + }, + { + "epoch": 1.6337579617834395, + "grad_norm": 2.002146005630493, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7096884250640869, + "num_tokens": 376526270.0, + "step": 14877 + }, + { + "epoch": 1.633867779486053, + "grad_norm": 2.538982391357422, + "learning_rate": 1e-06, + "loss": 
0.8529, + "mean_token_accuracy": 0.7314356565475464, + "num_tokens": 376546648.0, + "step": 14878 + }, + { + "epoch": 1.6339775971886668, + "grad_norm": 2.4432289600372314, + "learning_rate": 1e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7226690053939819, + "num_tokens": 376570354.0, + "step": 14879 + }, + { + "epoch": 1.6340874148912805, + "grad_norm": 2.2185773849487305, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7064376473426819, + "num_tokens": 376596876.0, + "step": 14880 + }, + { + "epoch": 1.6341972325938943, + "grad_norm": 2.6162328720092773, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7098646759986877, + "num_tokens": 376618595.0, + "step": 14881 + }, + { + "epoch": 1.6343070502965078, + "grad_norm": 2.5244650840759277, + "learning_rate": 1e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.7323341369628906, + "num_tokens": 376639596.0, + "step": 14882 + }, + { + "epoch": 1.6344168679991213, + "grad_norm": 2.3014512062072754, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7231912016868591, + "num_tokens": 376665477.0, + "step": 14883 + }, + { + "epoch": 1.634526685701735, + "grad_norm": 2.0797526836395264, + "learning_rate": 1e-06, + "loss": 0.839, + "mean_token_accuracy": 0.7306967973709106, + "num_tokens": 376694846.0, + "step": 14884 + }, + { + "epoch": 1.6346365034043489, + "grad_norm": 2.342980146408081, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7292131781578064, + "num_tokens": 376718862.0, + "step": 14885 + }, + { + "epoch": 1.6347463211069626, + "grad_norm": 2.0351383686065674, + "learning_rate": 1e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7318228483200073, + "num_tokens": 376749702.0, + "step": 14886 + }, + { + "epoch": 1.6348561388095761, + "grad_norm": 2.3650882244110107, + "learning_rate": 1e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.7281683087348938, + "num_tokens": 376772159.0, + "step": 14887 + }, + { + "epoch": 
1.6349659565121897, + "grad_norm": 2.422344446182251, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7261250019073486, + "num_tokens": 376796451.0, + "step": 14888 + }, + { + "epoch": 1.6350757742148034, + "grad_norm": 2.6912965774536133, + "learning_rate": 1e-06, + "loss": 0.8311, + "mean_token_accuracy": 0.7349481582641602, + "num_tokens": 376817707.0, + "step": 14889 + }, + { + "epoch": 1.6351855919174172, + "grad_norm": 2.8792450428009033, + "learning_rate": 1e-06, + "loss": 0.7992, + "mean_token_accuracy": 0.7485381364822388, + "num_tokens": 376834236.0, + "step": 14890 + }, + { + "epoch": 1.6352954096200307, + "grad_norm": 2.4148480892181396, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7016593813896179, + "num_tokens": 376858980.0, + "step": 14891 + }, + { + "epoch": 1.6354052273226443, + "grad_norm": 2.457679033279419, + "learning_rate": 1e-06, + "loss": 0.8639, + "mean_token_accuracy": 0.7248964905738831, + "num_tokens": 376880802.0, + "step": 14892 + }, + { + "epoch": 1.635515045025258, + "grad_norm": 2.1262407302856445, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.713219404220581, + "num_tokens": 376911977.0, + "step": 14893 + }, + { + "epoch": 1.6356248627278718, + "grad_norm": 2.1982686519622803, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.689208984375, + "num_tokens": 376939429.0, + "step": 14894 + }, + { + "epoch": 1.6357346804304855, + "grad_norm": 2.4792025089263916, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7230740785598755, + "num_tokens": 376960208.0, + "step": 14895 + }, + { + "epoch": 1.635844498133099, + "grad_norm": 2.343553066253662, + "learning_rate": 1e-06, + "loss": 0.8179, + "mean_token_accuracy": 0.7355166673660278, + "num_tokens": 376983501.0, + "step": 14896 + }, + { + "epoch": 1.6359543158357126, + "grad_norm": 2.094224452972412, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 
0.7127470970153809, + "num_tokens": 377011628.0, + "step": 14897 + }, + { + "epoch": 1.6360641335383264, + "grad_norm": 2.232609987258911, + "learning_rate": 1e-06, + "loss": 0.8729, + "mean_token_accuracy": 0.7315919399261475, + "num_tokens": 377037758.0, + "step": 14898 + }, + { + "epoch": 1.6361739512409401, + "grad_norm": 2.3183975219726562, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.707326352596283, + "num_tokens": 377062182.0, + "step": 14899 + }, + { + "epoch": 1.6362837689435537, + "grad_norm": 2.579378604888916, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7219741344451904, + "num_tokens": 377083255.0, + "step": 14900 + }, + { + "epoch": 1.6363935866461674, + "grad_norm": 2.865910530090332, + "learning_rate": 1e-06, + "loss": 0.7669, + "mean_token_accuracy": 0.7528886198997498, + "num_tokens": 377100225.0, + "step": 14901 + }, + { + "epoch": 1.636503404348781, + "grad_norm": 2.402918815612793, + "learning_rate": 1e-06, + "loss": 0.8075, + "mean_token_accuracy": 0.7419813275337219, + "num_tokens": 377124147.0, + "step": 14902 + }, + { + "epoch": 1.6366132220513947, + "grad_norm": 2.047666311264038, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7223731279373169, + "num_tokens": 377152804.0, + "step": 14903 + }, + { + "epoch": 1.6367230397540085, + "grad_norm": 2.2606449127197266, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7197698354721069, + "num_tokens": 377178769.0, + "step": 14904 + }, + { + "epoch": 1.636832857456622, + "grad_norm": 2.248497486114502, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7208468914031982, + "num_tokens": 377207755.0, + "step": 14905 + }, + { + "epoch": 1.6369426751592355, + "grad_norm": 2.100132465362549, + "learning_rate": 1e-06, + "loss": 0.8999, + "mean_token_accuracy": 0.7191842794418335, + "num_tokens": 377235430.0, + "step": 14906 + }, + { + "epoch": 1.6370524928618493, + "grad_norm": 
2.748631715774536, + "learning_rate": 1e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7301589250564575, + "num_tokens": 377255854.0, + "step": 14907 + }, + { + "epoch": 1.637162310564463, + "grad_norm": 2.039119243621826, + "learning_rate": 1e-06, + "loss": 0.8655, + "mean_token_accuracy": 0.7308157682418823, + "num_tokens": 377284094.0, + "step": 14908 + }, + { + "epoch": 1.6372721282670768, + "grad_norm": 2.0981531143188477, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.6998482942581177, + "num_tokens": 377314000.0, + "step": 14909 + }, + { + "epoch": 1.6373819459696903, + "grad_norm": 2.2163565158843994, + "learning_rate": 1e-06, + "loss": 0.8785, + "mean_token_accuracy": 0.7262611985206604, + "num_tokens": 377338421.0, + "step": 14910 + }, + { + "epoch": 1.6374917636723039, + "grad_norm": 2.2065491676330566, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.6998283863067627, + "num_tokens": 377364195.0, + "step": 14911 + }, + { + "epoch": 1.6376015813749176, + "grad_norm": 2.3400654792785645, + "learning_rate": 1e-06, + "loss": 0.8445, + "mean_token_accuracy": 0.7372626066207886, + "num_tokens": 377387834.0, + "step": 14912 + }, + { + "epoch": 1.6377113990775314, + "grad_norm": 2.3397819995880127, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7015087604522705, + "num_tokens": 377412250.0, + "step": 14913 + }, + { + "epoch": 1.637821216780145, + "grad_norm": 2.088785409927368, + "learning_rate": 1e-06, + "loss": 0.848, + "mean_token_accuracy": 0.7346234321594238, + "num_tokens": 377441202.0, + "step": 14914 + }, + { + "epoch": 1.6379310344827587, + "grad_norm": 2.3138458728790283, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.701557993888855, + "num_tokens": 377466290.0, + "step": 14915 + }, + { + "epoch": 1.6380408521853722, + "grad_norm": 2.540508508682251, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7127139568328857, + "num_tokens": 
377486329.0, + "step": 14916 + }, + { + "epoch": 1.638150669887986, + "grad_norm": 2.266610622406006, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7167313098907471, + "num_tokens": 377511966.0, + "step": 14917 + }, + { + "epoch": 1.6382604875905997, + "grad_norm": 2.534620761871338, + "learning_rate": 1e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7231931686401367, + "num_tokens": 377533396.0, + "step": 14918 + }, + { + "epoch": 1.6383703052932133, + "grad_norm": 2.4606668949127197, + "learning_rate": 1e-06, + "loss": 0.8274, + "mean_token_accuracy": 0.7410911321640015, + "num_tokens": 377554150.0, + "step": 14919 + }, + { + "epoch": 1.6384801229958268, + "grad_norm": 2.374635934829712, + "learning_rate": 1e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.7240089178085327, + "num_tokens": 377578376.0, + "step": 14920 + }, + { + "epoch": 1.6385899406984406, + "grad_norm": 2.255189895629883, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7081288695335388, + "num_tokens": 377605885.0, + "step": 14921 + }, + { + "epoch": 1.6386997584010543, + "grad_norm": 2.1928696632385254, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.718148410320282, + "num_tokens": 377632243.0, + "step": 14922 + }, + { + "epoch": 1.638809576103668, + "grad_norm": 2.237682580947876, + "learning_rate": 1e-06, + "loss": 0.7967, + "mean_token_accuracy": 0.7516659498214722, + "num_tokens": 377655605.0, + "step": 14923 + }, + { + "epoch": 1.6389193938062816, + "grad_norm": 2.0590078830718994, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7164039611816406, + "num_tokens": 377685138.0, + "step": 14924 + }, + { + "epoch": 1.6390292115088951, + "grad_norm": 2.3523550033569336, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7124313712120056, + "num_tokens": 377710035.0, + "step": 14925 + }, + { + "epoch": 1.639139029211509, + "grad_norm": 2.3526530265808105, + "learning_rate": 
1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7182053327560425, + "num_tokens": 377734021.0, + "step": 14926 + }, + { + "epoch": 1.6392488469141226, + "grad_norm": 2.3106393814086914, + "learning_rate": 1e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.7520657777786255, + "num_tokens": 377755662.0, + "step": 14927 + }, + { + "epoch": 1.6393586646167362, + "grad_norm": 2.156409978866577, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.7030587792396545, + "num_tokens": 377786313.0, + "step": 14928 + }, + { + "epoch": 1.6394684823193497, + "grad_norm": 2.0598833560943604, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7321746349334717, + "num_tokens": 377814490.0, + "step": 14929 + }, + { + "epoch": 1.6395783000219635, + "grad_norm": 2.0664937496185303, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7199519872665405, + "num_tokens": 377844107.0, + "step": 14930 + }, + { + "epoch": 1.6396881177245772, + "grad_norm": 1.9130563735961914, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7152278423309326, + "num_tokens": 377877294.0, + "step": 14931 + }, + { + "epoch": 1.639797935427191, + "grad_norm": 2.2503364086151123, + "learning_rate": 1e-06, + "loss": 0.8386, + "mean_token_accuracy": 0.7359541654586792, + "num_tokens": 377900538.0, + "step": 14932 + }, + { + "epoch": 1.6399077531298045, + "grad_norm": 2.063946485519409, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7059531211853027, + "num_tokens": 377931087.0, + "step": 14933 + }, + { + "epoch": 1.640017570832418, + "grad_norm": 2.4459331035614014, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7168776988983154, + "num_tokens": 377953154.0, + "step": 14934 + }, + { + "epoch": 1.6401273885350318, + "grad_norm": 2.2651560306549072, + "learning_rate": 1e-06, + "loss": 0.7995, + "mean_token_accuracy": 0.7450336217880249, + "num_tokens": 377977528.0, + "step": 14935 + }, + 
{ + "epoch": 1.6402372062376456, + "grad_norm": 2.3430869579315186, + "learning_rate": 1e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7211312055587769, + "num_tokens": 378000753.0, + "step": 14936 + }, + { + "epoch": 1.6403470239402593, + "grad_norm": 2.2742490768432617, + "learning_rate": 1e-06, + "loss": 0.8242, + "mean_token_accuracy": 0.7382248640060425, + "num_tokens": 378025101.0, + "step": 14937 + }, + { + "epoch": 1.6404568416428729, + "grad_norm": 2.2344040870666504, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.71470046043396, + "num_tokens": 378051065.0, + "step": 14938 + }, + { + "epoch": 1.6405666593454864, + "grad_norm": 2.0922811031341553, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7126026153564453, + "num_tokens": 378081676.0, + "step": 14939 + }, + { + "epoch": 1.6406764770481002, + "grad_norm": 2.185173749923706, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7134178280830383, + "num_tokens": 378108171.0, + "step": 14940 + }, + { + "epoch": 1.640786294750714, + "grad_norm": 2.351496696472168, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7150083184242249, + "num_tokens": 378132756.0, + "step": 14941 + }, + { + "epoch": 1.6408961124533274, + "grad_norm": 2.4295880794525146, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7259683012962341, + "num_tokens": 378154387.0, + "step": 14942 + }, + { + "epoch": 1.641005930155941, + "grad_norm": 2.211315155029297, + "learning_rate": 1e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.7280004024505615, + "num_tokens": 378180725.0, + "step": 14943 + }, + { + "epoch": 1.6411157478585547, + "grad_norm": 2.3630664348602295, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7036466598510742, + "num_tokens": 378204467.0, + "step": 14944 + }, + { + "epoch": 1.6412255655611685, + "grad_norm": 2.2974610328674316, + "learning_rate": 1e-06, + "loss": 0.8919, + 
"mean_token_accuracy": 0.7131779193878174, + "num_tokens": 378230322.0, + "step": 14945 + }, + { + "epoch": 1.6413353832637823, + "grad_norm": 2.5308499336242676, + "learning_rate": 1e-06, + "loss": 0.8421, + "mean_token_accuracy": 0.7295976281166077, + "num_tokens": 378251801.0, + "step": 14946 + }, + { + "epoch": 1.6414452009663958, + "grad_norm": 2.346785068511963, + "learning_rate": 1e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7299525737762451, + "num_tokens": 378274445.0, + "step": 14947 + }, + { + "epoch": 1.6415550186690093, + "grad_norm": 2.004321813583374, + "learning_rate": 1e-06, + "loss": 0.8696, + "mean_token_accuracy": 0.720634400844574, + "num_tokens": 378304293.0, + "step": 14948 + }, + { + "epoch": 1.641664836371623, + "grad_norm": 2.2647836208343506, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7166513204574585, + "num_tokens": 378330994.0, + "step": 14949 + }, + { + "epoch": 1.6417746540742368, + "grad_norm": 2.403008460998535, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7099767923355103, + "num_tokens": 378354376.0, + "step": 14950 + }, + { + "epoch": 1.6418844717768504, + "grad_norm": 2.4301936626434326, + "learning_rate": 1e-06, + "loss": 0.8176, + "mean_token_accuracy": 0.7469004988670349, + "num_tokens": 378375448.0, + "step": 14951 + }, + { + "epoch": 1.6419942894794641, + "grad_norm": 1.9916585683822632, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.703845739364624, + "num_tokens": 378408260.0, + "step": 14952 + }, + { + "epoch": 1.6421041071820777, + "grad_norm": 2.266369104385376, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7134730815887451, + "num_tokens": 378432792.0, + "step": 14953 + }, + { + "epoch": 1.6422139248846914, + "grad_norm": 2.8097360134124756, + "learning_rate": 1e-06, + "loss": 0.8466, + "mean_token_accuracy": 0.7353968620300293, + "num_tokens": 378450775.0, + "step": 14954 + }, + { + "epoch": 
1.6423237425873052, + "grad_norm": 2.186499834060669, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.6941856741905212, + "num_tokens": 378479496.0, + "step": 14955 + }, + { + "epoch": 1.6424335602899187, + "grad_norm": 2.224863052368164, + "learning_rate": 1e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.7231416702270508, + "num_tokens": 378506097.0, + "step": 14956 + }, + { + "epoch": 1.6425433779925322, + "grad_norm": 2.8201823234558105, + "learning_rate": 1e-06, + "loss": 0.8158, + "mean_token_accuracy": 0.7434086799621582, + "num_tokens": 378523228.0, + "step": 14957 + }, + { + "epoch": 1.642653195695146, + "grad_norm": 2.507216453552246, + "learning_rate": 1e-06, + "loss": 0.847, + "mean_token_accuracy": 0.738254725933075, + "num_tokens": 378542835.0, + "step": 14958 + }, + { + "epoch": 1.6427630133977598, + "grad_norm": 2.2175064086914062, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7338421940803528, + "num_tokens": 378569979.0, + "step": 14959 + }, + { + "epoch": 1.6428728311003735, + "grad_norm": 2.247349739074707, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7123590111732483, + "num_tokens": 378598561.0, + "step": 14960 + }, + { + "epoch": 1.642982648802987, + "grad_norm": 2.6845619678497314, + "learning_rate": 1e-06, + "loss": 0.8621, + "mean_token_accuracy": 0.7356878519058228, + "num_tokens": 378617702.0, + "step": 14961 + }, + { + "epoch": 1.6430924665056006, + "grad_norm": 2.2428276538848877, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.7170661687850952, + "num_tokens": 378642088.0, + "step": 14962 + }, + { + "epoch": 1.6432022842082143, + "grad_norm": 2.270998477935791, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7165719270706177, + "num_tokens": 378668085.0, + "step": 14963 + }, + { + "epoch": 1.643312101910828, + "grad_norm": 2.0795376300811768, + "learning_rate": 1e-06, + "loss": 1.0273, + "mean_token_accuracy": 
0.6861361265182495, + "num_tokens": 378699633.0, + "step": 14964 + }, + { + "epoch": 1.6434219196134416, + "grad_norm": 2.3099188804626465, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7023953795433044, + "num_tokens": 378725207.0, + "step": 14965 + }, + { + "epoch": 1.6435317373160554, + "grad_norm": 2.169419527053833, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7095114588737488, + "num_tokens": 378751658.0, + "step": 14966 + }, + { + "epoch": 1.643641555018669, + "grad_norm": 2.250131130218506, + "learning_rate": 1e-06, + "loss": 0.8109, + "mean_token_accuracy": 0.7424373626708984, + "num_tokens": 378775460.0, + "step": 14967 + }, + { + "epoch": 1.6437513727212827, + "grad_norm": 2.5219218730926514, + "learning_rate": 1e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.7349017858505249, + "num_tokens": 378795702.0, + "step": 14968 + }, + { + "epoch": 1.6438611904238964, + "grad_norm": 2.0593926906585693, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7193349599838257, + "num_tokens": 378828318.0, + "step": 14969 + }, + { + "epoch": 1.64397100812651, + "grad_norm": 2.297820806503296, + "learning_rate": 1e-06, + "loss": 0.861, + "mean_token_accuracy": 0.7277095317840576, + "num_tokens": 378851288.0, + "step": 14970 + }, + { + "epoch": 1.6440808258291235, + "grad_norm": 2.2157087326049805, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7030866146087646, + "num_tokens": 378880470.0, + "step": 14971 + }, + { + "epoch": 1.6441906435317373, + "grad_norm": 2.3521480560302734, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7315318584442139, + "num_tokens": 378905088.0, + "step": 14972 + }, + { + "epoch": 1.644300461234351, + "grad_norm": 2.494600296020508, + "learning_rate": 1e-06, + "loss": 0.8283, + "mean_token_accuracy": 0.741369366645813, + "num_tokens": 378926104.0, + "step": 14973 + }, + { + "epoch": 1.6444102789369648, + "grad_norm": 
2.181335210800171, + "learning_rate": 1e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.720125675201416, + "num_tokens": 378952849.0, + "step": 14974 + }, + { + "epoch": 1.6445200966395783, + "grad_norm": 2.26835298538208, + "learning_rate": 1e-06, + "loss": 0.8232, + "mean_token_accuracy": 0.7410078644752502, + "num_tokens": 378978586.0, + "step": 14975 + }, + { + "epoch": 1.6446299143421919, + "grad_norm": 2.6339497566223145, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7214824557304382, + "num_tokens": 378998355.0, + "step": 14976 + }, + { + "epoch": 1.6447397320448056, + "grad_norm": 2.3057873249053955, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7057216167449951, + "num_tokens": 379023424.0, + "step": 14977 + }, + { + "epoch": 1.6448495497474194, + "grad_norm": 2.2027597427368164, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7153503894805908, + "num_tokens": 379049592.0, + "step": 14978 + }, + { + "epoch": 1.644959367450033, + "grad_norm": 2.0294106006622314, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7124237418174744, + "num_tokens": 379081106.0, + "step": 14979 + }, + { + "epoch": 1.6450691851526464, + "grad_norm": 2.2829394340515137, + "learning_rate": 1e-06, + "loss": 0.8269, + "mean_token_accuracy": 0.7346475720405579, + "num_tokens": 379105920.0, + "step": 14980 + }, + { + "epoch": 1.6451790028552602, + "grad_norm": 2.4686501026153564, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7170332670211792, + "num_tokens": 379129635.0, + "step": 14981 + }, + { + "epoch": 1.645288820557874, + "grad_norm": 2.2375552654266357, + "learning_rate": 1e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7222725749015808, + "num_tokens": 379156171.0, + "step": 14982 + }, + { + "epoch": 1.6453986382604877, + "grad_norm": 2.2759220600128174, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7215797901153564, + "num_tokens": 
379181790.0, + "step": 14983 + }, + { + "epoch": 1.6455084559631012, + "grad_norm": 2.158086061477661, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.6929931640625, + "num_tokens": 379211662.0, + "step": 14984 + }, + { + "epoch": 1.6456182736657148, + "grad_norm": 2.3612377643585205, + "learning_rate": 1e-06, + "loss": 0.8472, + "mean_token_accuracy": 0.7426121234893799, + "num_tokens": 379235980.0, + "step": 14985 + }, + { + "epoch": 1.6457280913683285, + "grad_norm": 1.9483301639556885, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7021749019622803, + "num_tokens": 379268646.0, + "step": 14986 + }, + { + "epoch": 1.6458379090709423, + "grad_norm": 2.2109601497650146, + "learning_rate": 1e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.6894357204437256, + "num_tokens": 379295830.0, + "step": 14987 + }, + { + "epoch": 1.645947726773556, + "grad_norm": 2.2443909645080566, + "learning_rate": 1e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7220847010612488, + "num_tokens": 379321287.0, + "step": 14988 + }, + { + "epoch": 1.6460575444761696, + "grad_norm": 2.5171828269958496, + "learning_rate": 1e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7264484167098999, + "num_tokens": 379343673.0, + "step": 14989 + }, + { + "epoch": 1.6461673621787831, + "grad_norm": 2.410681962966919, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.6993130445480347, + "num_tokens": 379367965.0, + "step": 14990 + }, + { + "epoch": 1.6462771798813969, + "grad_norm": 2.1963253021240234, + "learning_rate": 1e-06, + "loss": 0.7782, + "mean_token_accuracy": 0.7535133957862854, + "num_tokens": 379391801.0, + "step": 14991 + }, + { + "epoch": 1.6463869975840106, + "grad_norm": 2.1967484951019287, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7120243310928345, + "num_tokens": 379420852.0, + "step": 14992 + }, + { + "epoch": 1.6464968152866242, + "grad_norm": 2.400930404663086, + "learning_rate": 
1e-06, + "loss": 0.8015, + "mean_token_accuracy": 0.7427430152893066, + "num_tokens": 379441973.0, + "step": 14993 + }, + { + "epoch": 1.6466066329892377, + "grad_norm": 2.350614547729492, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7214192152023315, + "num_tokens": 379467185.0, + "step": 14994 + }, + { + "epoch": 1.6467164506918515, + "grad_norm": 2.064049482345581, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7008993625640869, + "num_tokens": 379496281.0, + "step": 14995 + }, + { + "epoch": 1.6468262683944652, + "grad_norm": 2.4402577877044678, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7174110412597656, + "num_tokens": 379518063.0, + "step": 14996 + }, + { + "epoch": 1.646936086097079, + "grad_norm": 2.2986600399017334, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7079403400421143, + "num_tokens": 379544797.0, + "step": 14997 + }, + { + "epoch": 1.6470459037996925, + "grad_norm": 2.2300312519073486, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.6975467801094055, + "num_tokens": 379570826.0, + "step": 14998 + }, + { + "epoch": 1.647155721502306, + "grad_norm": 2.341801166534424, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7283033132553101, + "num_tokens": 379596691.0, + "step": 14999 + }, + { + "epoch": 1.6472655392049198, + "grad_norm": 2.4749836921691895, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7061697840690613, + "num_tokens": 379619966.0, + "step": 15000 + }, + { + "epoch": 1.6473753569075336, + "grad_norm": 2.120673656463623, + "learning_rate": 1e-06, + "loss": 0.7152, + "mean_token_accuracy": 0.7668395042419434, + "num_tokens": 379643679.0, + "step": 15001 + }, + { + "epoch": 1.6474851746101473, + "grad_norm": 2.2099485397338867, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.6925606727600098, + "num_tokens": 379668546.0, + "step": 15002 + }, + { 
+ "epoch": 1.6475949923127609, + "grad_norm": 2.276118040084839, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7109553217887878, + "num_tokens": 379693548.0, + "step": 15003 + }, + { + "epoch": 1.6477048100153744, + "grad_norm": 2.5002310276031494, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7133118510246277, + "num_tokens": 379716265.0, + "step": 15004 + }, + { + "epoch": 1.6478146277179881, + "grad_norm": 2.2683098316192627, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7137988805770874, + "num_tokens": 379741802.0, + "step": 15005 + }, + { + "epoch": 1.647924445420602, + "grad_norm": 2.516745090484619, + "learning_rate": 1e-06, + "loss": 0.8324, + "mean_token_accuracy": 0.7362489700317383, + "num_tokens": 379762390.0, + "step": 15006 + }, + { + "epoch": 1.6480342631232154, + "grad_norm": 2.2805566787719727, + "learning_rate": 1e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.729831874370575, + "num_tokens": 379786499.0, + "step": 15007 + }, + { + "epoch": 1.648144080825829, + "grad_norm": 2.4210572242736816, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7212357521057129, + "num_tokens": 379810272.0, + "step": 15008 + }, + { + "epoch": 1.6482538985284427, + "grad_norm": 2.1486072540283203, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7207990884780884, + "num_tokens": 379839236.0, + "step": 15009 + }, + { + "epoch": 1.6483637162310565, + "grad_norm": 2.109473943710327, + "learning_rate": 1e-06, + "loss": 1.0194, + "mean_token_accuracy": 0.6923792958259583, + "num_tokens": 379869614.0, + "step": 15010 + }, + { + "epoch": 1.6484735339336702, + "grad_norm": 2.4791312217712402, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7139467000961304, + "num_tokens": 379892308.0, + "step": 15011 + }, + { + "epoch": 1.6485833516362838, + "grad_norm": 2.0161726474761963, + "learning_rate": 1e-06, + "loss": 1.0046, + 
"mean_token_accuracy": 0.6921214461326599, + "num_tokens": 379925371.0, + "step": 15012 + }, + { + "epoch": 1.6486931693388973, + "grad_norm": 2.046785354614258, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7228661775588989, + "num_tokens": 379955914.0, + "step": 15013 + }, + { + "epoch": 1.648802987041511, + "grad_norm": 2.409079074859619, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7089242935180664, + "num_tokens": 379979017.0, + "step": 15014 + }, + { + "epoch": 1.6489128047441248, + "grad_norm": 2.2688684463500977, + "learning_rate": 1e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7341244220733643, + "num_tokens": 380005166.0, + "step": 15015 + }, + { + "epoch": 1.6490226224467384, + "grad_norm": 2.352185010910034, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7283429503440857, + "num_tokens": 380029896.0, + "step": 15016 + }, + { + "epoch": 1.6491324401493521, + "grad_norm": 2.1848676204681396, + "learning_rate": 1e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.7190698385238647, + "num_tokens": 380057305.0, + "step": 15017 + }, + { + "epoch": 1.6492422578519657, + "grad_norm": 2.210845470428467, + "learning_rate": 1e-06, + "loss": 0.7738, + "mean_token_accuracy": 0.7554388046264648, + "num_tokens": 380081813.0, + "step": 15018 + }, + { + "epoch": 1.6493520755545794, + "grad_norm": 1.9973838329315186, + "learning_rate": 1e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7165154218673706, + "num_tokens": 380111636.0, + "step": 15019 + }, + { + "epoch": 1.6494618932571932, + "grad_norm": 2.1501171588897705, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.705826997756958, + "num_tokens": 380140786.0, + "step": 15020 + }, + { + "epoch": 1.6495717109598067, + "grad_norm": 2.084577798843384, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7205891609191895, + "num_tokens": 380171237.0, + "step": 15021 + }, + { + "epoch": 
1.6496815286624202, + "grad_norm": 1.819092869758606, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7016703486442566, + "num_tokens": 380208387.0, + "step": 15022 + }, + { + "epoch": 1.649791346365034, + "grad_norm": 2.4682254791259766, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7178334593772888, + "num_tokens": 380231776.0, + "step": 15023 + }, + { + "epoch": 1.6499011640676478, + "grad_norm": 2.160412073135376, + "learning_rate": 1e-06, + "loss": 0.8241, + "mean_token_accuracy": 0.7328264117240906, + "num_tokens": 380257925.0, + "step": 15024 + }, + { + "epoch": 1.6500109817702615, + "grad_norm": 2.2052879333496094, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7092766761779785, + "num_tokens": 380284546.0, + "step": 15025 + }, + { + "epoch": 1.650120799472875, + "grad_norm": 2.223984479904175, + "learning_rate": 1e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7224245071411133, + "num_tokens": 380312616.0, + "step": 15026 + }, + { + "epoch": 1.6502306171754886, + "grad_norm": 2.282700300216675, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7036082744598389, + "num_tokens": 380338719.0, + "step": 15027 + }, + { + "epoch": 1.6503404348781023, + "grad_norm": 2.047592878341675, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7083227038383484, + "num_tokens": 380370958.0, + "step": 15028 + }, + { + "epoch": 1.650450252580716, + "grad_norm": 2.327913761138916, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7017163038253784, + "num_tokens": 380396689.0, + "step": 15029 + }, + { + "epoch": 1.6505600702833296, + "grad_norm": 2.2255797386169434, + "learning_rate": 1e-06, + "loss": 0.8068, + "mean_token_accuracy": 0.7537894248962402, + "num_tokens": 380421260.0, + "step": 15030 + }, + { + "epoch": 1.6506698879859434, + "grad_norm": 2.2274136543273926, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 
0.7123675346374512, + "num_tokens": 380447314.0, + "step": 15031 + }, + { + "epoch": 1.650779705688557, + "grad_norm": 2.1978015899658203, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7177180647850037, + "num_tokens": 380474528.0, + "step": 15032 + }, + { + "epoch": 1.6508895233911707, + "grad_norm": 2.1557118892669678, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7205793261528015, + "num_tokens": 380500027.0, + "step": 15033 + }, + { + "epoch": 1.6509993410937844, + "grad_norm": 2.2942545413970947, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.726824939250946, + "num_tokens": 380527702.0, + "step": 15034 + }, + { + "epoch": 1.651109158796398, + "grad_norm": 2.264622926712036, + "learning_rate": 1e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7332108616828918, + "num_tokens": 380552779.0, + "step": 15035 + }, + { + "epoch": 1.6512189764990115, + "grad_norm": 2.2761805057525635, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7220932245254517, + "num_tokens": 380577588.0, + "step": 15036 + }, + { + "epoch": 1.6513287942016253, + "grad_norm": 2.3010871410369873, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7388845682144165, + "num_tokens": 380600359.0, + "step": 15037 + }, + { + "epoch": 1.651438611904239, + "grad_norm": 2.4546613693237305, + "learning_rate": 1e-06, + "loss": 0.8897, + "mean_token_accuracy": 0.7215287685394287, + "num_tokens": 380622901.0, + "step": 15038 + }, + { + "epoch": 1.6515484296068528, + "grad_norm": 2.372720718383789, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.6984524726867676, + "num_tokens": 380647524.0, + "step": 15039 + }, + { + "epoch": 1.6516582473094663, + "grad_norm": 2.3124141693115234, + "learning_rate": 1e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7282301187515259, + "num_tokens": 380671166.0, + "step": 15040 + }, + { + "epoch": 1.6517680650120798, + "grad_norm": 
2.1450307369232178, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7157270908355713, + "num_tokens": 380699209.0, + "step": 15041 + }, + { + "epoch": 1.6518778827146936, + "grad_norm": 2.0924623012542725, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7183815240859985, + "num_tokens": 380728254.0, + "step": 15042 + }, + { + "epoch": 1.6519877004173074, + "grad_norm": 1.9520087242126465, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.6930872201919556, + "num_tokens": 380759252.0, + "step": 15043 + }, + { + "epoch": 1.652097518119921, + "grad_norm": 2.09024715423584, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7118634581565857, + "num_tokens": 380789575.0, + "step": 15044 + }, + { + "epoch": 1.6522073358225344, + "grad_norm": 2.4220805168151855, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7136334776878357, + "num_tokens": 380811881.0, + "step": 15045 + }, + { + "epoch": 1.6523171535251482, + "grad_norm": 2.511117935180664, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7166365385055542, + "num_tokens": 380834010.0, + "step": 15046 + }, + { + "epoch": 1.652426971227762, + "grad_norm": 2.363041639328003, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7181721925735474, + "num_tokens": 380857265.0, + "step": 15047 + }, + { + "epoch": 1.6525367889303757, + "grad_norm": 2.1308913230895996, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7254327535629272, + "num_tokens": 380885153.0, + "step": 15048 + }, + { + "epoch": 1.6526466066329892, + "grad_norm": 2.167304277420044, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.703991711139679, + "num_tokens": 380912174.0, + "step": 15049 + }, + { + "epoch": 1.6527564243356028, + "grad_norm": 2.27643084526062, + "learning_rate": 1e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.7266225814819336, + "num_tokens": 
380937666.0, + "step": 15050 + }, + { + "epoch": 1.6528662420382165, + "grad_norm": 2.5627079010009766, + "learning_rate": 1e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.7235491275787354, + "num_tokens": 380957453.0, + "step": 15051 + }, + { + "epoch": 1.6529760597408303, + "grad_norm": 2.3877296447753906, + "learning_rate": 1e-06, + "loss": 0.8546, + "mean_token_accuracy": 0.7302873730659485, + "num_tokens": 380978629.0, + "step": 15052 + }, + { + "epoch": 1.653085877443444, + "grad_norm": 2.3150408267974854, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7292439341545105, + "num_tokens": 381003044.0, + "step": 15053 + }, + { + "epoch": 1.6531956951460576, + "grad_norm": 2.4709651470184326, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7094768285751343, + "num_tokens": 381025045.0, + "step": 15054 + }, + { + "epoch": 1.653305512848671, + "grad_norm": 2.630833864212036, + "learning_rate": 1e-06, + "loss": 0.8293, + "mean_token_accuracy": 0.7347027063369751, + "num_tokens": 381043871.0, + "step": 15055 + }, + { + "epoch": 1.6534153305512849, + "grad_norm": 2.31946063041687, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.6995655298233032, + "num_tokens": 381068489.0, + "step": 15056 + }, + { + "epoch": 1.6535251482538986, + "grad_norm": 2.4050984382629395, + "learning_rate": 1e-06, + "loss": 0.7852, + "mean_token_accuracy": 0.7455335259437561, + "num_tokens": 381090429.0, + "step": 15057 + }, + { + "epoch": 1.6536349659565122, + "grad_norm": 2.373202085494995, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7249783277511597, + "num_tokens": 381113949.0, + "step": 15058 + }, + { + "epoch": 1.6537447836591257, + "grad_norm": 2.1456847190856934, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7176908850669861, + "num_tokens": 381139568.0, + "step": 15059 + }, + { + "epoch": 1.6538546013617395, + "grad_norm": 2.487687826156616, + "learning_rate": 
1e-06, + "loss": 0.8201, + "mean_token_accuracy": 0.7356336712837219, + "num_tokens": 381160631.0, + "step": 15060 + }, + { + "epoch": 1.6539644190643532, + "grad_norm": 2.3379087448120117, + "learning_rate": 1e-06, + "loss": 0.8026, + "mean_token_accuracy": 0.7558712959289551, + "num_tokens": 381181729.0, + "step": 15061 + }, + { + "epoch": 1.654074236766967, + "grad_norm": 2.405648708343506, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7075337171554565, + "num_tokens": 381205517.0, + "step": 15062 + }, + { + "epoch": 1.6541840544695805, + "grad_norm": 2.077679395675659, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7018315196037292, + "num_tokens": 381234944.0, + "step": 15063 + }, + { + "epoch": 1.654293872172194, + "grad_norm": 2.2168619632720947, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7223036289215088, + "num_tokens": 381261543.0, + "step": 15064 + }, + { + "epoch": 1.6544036898748078, + "grad_norm": 2.365384101867676, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7278084754943848, + "num_tokens": 381286715.0, + "step": 15065 + }, + { + "epoch": 1.6545135075774215, + "grad_norm": 2.291590690612793, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7284632921218872, + "num_tokens": 381311900.0, + "step": 15066 + }, + { + "epoch": 1.6546233252800353, + "grad_norm": 2.3423244953155518, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7271015048027039, + "num_tokens": 381337461.0, + "step": 15067 + }, + { + "epoch": 1.6547331429826488, + "grad_norm": 2.414484739303589, + "learning_rate": 1e-06, + "loss": 0.8258, + "mean_token_accuracy": 0.741519033908844, + "num_tokens": 381361402.0, + "step": 15068 + }, + { + "epoch": 1.6548429606852624, + "grad_norm": 2.5030884742736816, + "learning_rate": 1e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7297159433364868, + "num_tokens": 381381865.0, + "step": 15069 + }, + { + 
"epoch": 1.6549527783878761, + "grad_norm": 2.414625883102417, + "learning_rate": 1e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.7292795777320862, + "num_tokens": 381405946.0, + "step": 15070 + }, + { + "epoch": 1.65506259609049, + "grad_norm": 2.0015904903411865, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7339381575584412, + "num_tokens": 381435136.0, + "step": 15071 + }, + { + "epoch": 1.6551724137931034, + "grad_norm": 2.154707193374634, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7215551137924194, + "num_tokens": 381461739.0, + "step": 15072 + }, + { + "epoch": 1.655282231495717, + "grad_norm": 2.6497795581817627, + "learning_rate": 1e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.7297940850257874, + "num_tokens": 381480336.0, + "step": 15073 + }, + { + "epoch": 1.6553920491983307, + "grad_norm": 2.8114662170410156, + "learning_rate": 1e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7259635329246521, + "num_tokens": 381497608.0, + "step": 15074 + }, + { + "epoch": 1.6555018669009445, + "grad_norm": 1.9914209842681885, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7021621465682983, + "num_tokens": 381527871.0, + "step": 15075 + }, + { + "epoch": 1.6556116846035582, + "grad_norm": 2.020448923110962, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7246813774108887, + "num_tokens": 381558506.0, + "step": 15076 + }, + { + "epoch": 1.6557215023061718, + "grad_norm": 2.4086804389953613, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7280898094177246, + "num_tokens": 381582427.0, + "step": 15077 + }, + { + "epoch": 1.6558313200087853, + "grad_norm": 2.489715337753296, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7189433574676514, + "num_tokens": 381604661.0, + "step": 15078 + }, + { + "epoch": 1.655941137711399, + "grad_norm": 2.7224111557006836, + "learning_rate": 1e-06, + "loss": 0.8528, + 
"mean_token_accuracy": 0.7328238487243652, + "num_tokens": 381625486.0, + "step": 15079 + }, + { + "epoch": 1.6560509554140128, + "grad_norm": 2.15043306350708, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7269712090492249, + "num_tokens": 381654787.0, + "step": 15080 + }, + { + "epoch": 1.6561607731166264, + "grad_norm": 2.3487203121185303, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7072911858558655, + "num_tokens": 381677891.0, + "step": 15081 + }, + { + "epoch": 1.65627059081924, + "grad_norm": 2.050110340118408, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.703215479850769, + "num_tokens": 381708306.0, + "step": 15082 + }, + { + "epoch": 1.6563804085218536, + "grad_norm": 2.1345760822296143, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7154513001441956, + "num_tokens": 381735076.0, + "step": 15083 + }, + { + "epoch": 1.6564902262244674, + "grad_norm": 2.5685040950775146, + "learning_rate": 1e-06, + "loss": 0.8389, + "mean_token_accuracy": 0.7401285171508789, + "num_tokens": 381754743.0, + "step": 15084 + }, + { + "epoch": 1.6566000439270812, + "grad_norm": 2.2300662994384766, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7317746877670288, + "num_tokens": 381781657.0, + "step": 15085 + }, + { + "epoch": 1.6567098616296947, + "grad_norm": 2.0204219818115234, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7208886742591858, + "num_tokens": 381810625.0, + "step": 15086 + }, + { + "epoch": 1.6568196793323082, + "grad_norm": 2.0110440254211426, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7229386568069458, + "num_tokens": 381841176.0, + "step": 15087 + }, + { + "epoch": 1.656929497034922, + "grad_norm": 2.441876173019409, + "learning_rate": 1e-06, + "loss": 0.7984, + "mean_token_accuracy": 0.7440965175628662, + "num_tokens": 381862403.0, + "step": 15088 + }, + { + "epoch": 
1.6570393147375357, + "grad_norm": 2.277146816253662, + "learning_rate": 1e-06, + "loss": 0.8052, + "mean_token_accuracy": 0.7456099987030029, + "num_tokens": 381887406.0, + "step": 15089 + }, + { + "epoch": 1.6571491324401495, + "grad_norm": 2.209502696990967, + "learning_rate": 1e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.7213417291641235, + "num_tokens": 381914080.0, + "step": 15090 + }, + { + "epoch": 1.657258950142763, + "grad_norm": 1.9980565309524536, + "learning_rate": 1e-06, + "loss": 0.8697, + "mean_token_accuracy": 0.7317593097686768, + "num_tokens": 381943518.0, + "step": 15091 + }, + { + "epoch": 1.6573687678453766, + "grad_norm": 2.1940906047821045, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7045158743858337, + "num_tokens": 381971277.0, + "step": 15092 + }, + { + "epoch": 1.6574785855479903, + "grad_norm": 2.4519567489624023, + "learning_rate": 1e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7397779226303101, + "num_tokens": 381992669.0, + "step": 15093 + }, + { + "epoch": 1.657588403250604, + "grad_norm": 2.2823944091796875, + "learning_rate": 1e-06, + "loss": 0.8144, + "mean_token_accuracy": 0.7420921325683594, + "num_tokens": 382016872.0, + "step": 15094 + }, + { + "epoch": 1.6576982209532176, + "grad_norm": 2.5286312103271484, + "learning_rate": 1e-06, + "loss": 0.8615, + "mean_token_accuracy": 0.7281363010406494, + "num_tokens": 382038620.0, + "step": 15095 + }, + { + "epoch": 1.6578080386558314, + "grad_norm": 2.3366363048553467, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7114458084106445, + "num_tokens": 382065637.0, + "step": 15096 + }, + { + "epoch": 1.657917856358445, + "grad_norm": 2.2975945472717285, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7289036512374878, + "num_tokens": 382090711.0, + "step": 15097 + }, + { + "epoch": 1.6580276740610587, + "grad_norm": 2.0575108528137207, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 
0.714318037033081, + "num_tokens": 382120452.0, + "step": 15098 + }, + { + "epoch": 1.6581374917636724, + "grad_norm": 2.121796131134033, + "learning_rate": 1e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7260450124740601, + "num_tokens": 382148405.0, + "step": 15099 + }, + { + "epoch": 1.658247309466286, + "grad_norm": 2.335146188735962, + "learning_rate": 1e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.74260413646698, + "num_tokens": 382170109.0, + "step": 15100 + }, + { + "epoch": 1.6583571271688995, + "grad_norm": 2.755345344543457, + "learning_rate": 1e-06, + "loss": 0.8057, + "mean_token_accuracy": 0.7413121461868286, + "num_tokens": 382188656.0, + "step": 15101 + }, + { + "epoch": 1.6584669448715132, + "grad_norm": 2.2247982025146484, + "learning_rate": 1e-06, + "loss": 0.8828, + "mean_token_accuracy": 0.7309922575950623, + "num_tokens": 382212753.0, + "step": 15102 + }, + { + "epoch": 1.658576762574127, + "grad_norm": 2.228280544281006, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.718012809753418, + "num_tokens": 382238365.0, + "step": 15103 + }, + { + "epoch": 1.6586865802767408, + "grad_norm": 2.5189850330352783, + "learning_rate": 1e-06, + "loss": 0.7878, + "mean_token_accuracy": 0.752556324005127, + "num_tokens": 382257546.0, + "step": 15104 + }, + { + "epoch": 1.6587963979793543, + "grad_norm": 2.3069183826446533, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.71622234582901, + "num_tokens": 382283725.0, + "step": 15105 + }, + { + "epoch": 1.6589062156819678, + "grad_norm": 2.3633923530578613, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7042579054832458, + "num_tokens": 382306739.0, + "step": 15106 + }, + { + "epoch": 1.6590160333845816, + "grad_norm": 2.0777406692504883, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.6971093416213989, + "num_tokens": 382336450.0, + "step": 15107 + }, + { + "epoch": 1.6591258510871953, + "grad_norm": 
2.182382822036743, + "learning_rate": 1e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7248508334159851, + "num_tokens": 382362255.0, + "step": 15108 + }, + { + "epoch": 1.6592356687898089, + "grad_norm": 2.2486486434936523, + "learning_rate": 1e-06, + "loss": 0.7609, + "mean_token_accuracy": 0.757373034954071, + "num_tokens": 382384636.0, + "step": 15109 + }, + { + "epoch": 1.6593454864924224, + "grad_norm": 2.3801746368408203, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7221809029579163, + "num_tokens": 382405920.0, + "step": 15110 + }, + { + "epoch": 1.6594553041950362, + "grad_norm": 2.1045925617218018, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7026562690734863, + "num_tokens": 382432758.0, + "step": 15111 + }, + { + "epoch": 1.65956512189765, + "grad_norm": 2.050224542617798, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.6982378363609314, + "num_tokens": 382461293.0, + "step": 15112 + }, + { + "epoch": 1.6596749396002637, + "grad_norm": 2.3121066093444824, + "learning_rate": 1e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7376840710639954, + "num_tokens": 382482073.0, + "step": 15113 + }, + { + "epoch": 1.6597847573028772, + "grad_norm": 2.306537389755249, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7217507362365723, + "num_tokens": 382506630.0, + "step": 15114 + }, + { + "epoch": 1.6598945750054908, + "grad_norm": 2.284071922302246, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7140465974807739, + "num_tokens": 382531585.0, + "step": 15115 + }, + { + "epoch": 1.6600043927081045, + "grad_norm": 2.2373197078704834, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7143956422805786, + "num_tokens": 382559009.0, + "step": 15116 + }, + { + "epoch": 1.6601142104107183, + "grad_norm": 2.1497385501861572, + "learning_rate": 1e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7290704250335693, + "num_tokens": 
382586713.0, + "step": 15117 + }, + { + "epoch": 1.660224028113332, + "grad_norm": 2.0100975036621094, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7119368314743042, + "num_tokens": 382616589.0, + "step": 15118 + }, + { + "epoch": 1.6603338458159456, + "grad_norm": 2.2469735145568848, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7190290689468384, + "num_tokens": 382643842.0, + "step": 15119 + }, + { + "epoch": 1.660443663518559, + "grad_norm": 2.555504322052002, + "learning_rate": 1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.732404351234436, + "num_tokens": 382664558.0, + "step": 15120 + }, + { + "epoch": 1.6605534812211729, + "grad_norm": 2.084329843521118, + "learning_rate": 1e-06, + "loss": 0.8173, + "mean_token_accuracy": 0.7361550331115723, + "num_tokens": 382690008.0, + "step": 15121 + }, + { + "epoch": 1.6606632989237866, + "grad_norm": 2.130647897720337, + "learning_rate": 1e-06, + "loss": 0.7929, + "mean_token_accuracy": 0.7498971223831177, + "num_tokens": 382714339.0, + "step": 15122 + }, + { + "epoch": 1.6607731166264001, + "grad_norm": 2.008190155029297, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7146713137626648, + "num_tokens": 382745146.0, + "step": 15123 + }, + { + "epoch": 1.6608829343290137, + "grad_norm": 2.5972657203674316, + "learning_rate": 1e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7344085574150085, + "num_tokens": 382767611.0, + "step": 15124 + }, + { + "epoch": 1.6609927520316274, + "grad_norm": 2.3180899620056152, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7222713232040405, + "num_tokens": 382793032.0, + "step": 15125 + }, + { + "epoch": 1.6611025697342412, + "grad_norm": 2.0960512161254883, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7144968509674072, + "num_tokens": 382821733.0, + "step": 15126 + }, + { + "epoch": 1.661212387436855, + "grad_norm": 2.3668053150177, + "learning_rate": 1e-06, + 
"loss": 0.8944, + "mean_token_accuracy": 0.7193748950958252, + "num_tokens": 382844526.0, + "step": 15127 + }, + { + "epoch": 1.6613222051394685, + "grad_norm": 2.04066801071167, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.713448166847229, + "num_tokens": 382873710.0, + "step": 15128 + }, + { + "epoch": 1.661432022842082, + "grad_norm": 2.3593907356262207, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7319592833518982, + "num_tokens": 382898596.0, + "step": 15129 + }, + { + "epoch": 1.6615418405446958, + "grad_norm": 2.1949150562286377, + "learning_rate": 1e-06, + "loss": 0.8384, + "mean_token_accuracy": 0.740757167339325, + "num_tokens": 382924965.0, + "step": 15130 + }, + { + "epoch": 1.6616516582473095, + "grad_norm": 2.13377046585083, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7020800709724426, + "num_tokens": 382955457.0, + "step": 15131 + }, + { + "epoch": 1.661761475949923, + "grad_norm": 2.0888864994049072, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7134172320365906, + "num_tokens": 382982566.0, + "step": 15132 + }, + { + "epoch": 1.6618712936525368, + "grad_norm": 2.208745002746582, + "learning_rate": 1e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.7255544066429138, + "num_tokens": 383009060.0, + "step": 15133 + }, + { + "epoch": 1.6619811113551504, + "grad_norm": 2.064063549041748, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7247365713119507, + "num_tokens": 383037813.0, + "step": 15134 + }, + { + "epoch": 1.6620909290577641, + "grad_norm": 2.2726123332977295, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7009400129318237, + "num_tokens": 383063786.0, + "step": 15135 + }, + { + "epoch": 1.6622007467603779, + "grad_norm": 2.1785991191864014, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7082507014274597, + "num_tokens": 383090749.0, + "step": 15136 + }, + { + "epoch": 
1.6623105644629914, + "grad_norm": 2.3964052200317383, + "learning_rate": 1e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.7250173091888428, + "num_tokens": 383112470.0, + "step": 15137 + }, + { + "epoch": 1.662420382165605, + "grad_norm": 2.081603527069092, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7239044904708862, + "num_tokens": 383143701.0, + "step": 15138 + }, + { + "epoch": 1.6625301998682187, + "grad_norm": 2.2840795516967773, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7211748361587524, + "num_tokens": 383167231.0, + "step": 15139 + }, + { + "epoch": 1.6626400175708325, + "grad_norm": 1.7740907669067383, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.6923031806945801, + "num_tokens": 383204870.0, + "step": 15140 + }, + { + "epoch": 1.6627498352734462, + "grad_norm": 2.710426092147827, + "learning_rate": 1e-06, + "loss": 0.8169, + "mean_token_accuracy": 0.7466213703155518, + "num_tokens": 383222789.0, + "step": 15141 + }, + { + "epoch": 1.6628596529760598, + "grad_norm": 2.110137462615967, + "learning_rate": 1e-06, + "loss": 0.8417, + "mean_token_accuracy": 0.734931468963623, + "num_tokens": 383248561.0, + "step": 15142 + }, + { + "epoch": 1.6629694706786733, + "grad_norm": 2.5197360515594482, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.712708592414856, + "num_tokens": 383270587.0, + "step": 15143 + }, + { + "epoch": 1.663079288381287, + "grad_norm": 2.280444860458374, + "learning_rate": 1e-06, + "loss": 0.8444, + "mean_token_accuracy": 0.7354248762130737, + "num_tokens": 383294070.0, + "step": 15144 + }, + { + "epoch": 1.6631891060839008, + "grad_norm": 2.470818281173706, + "learning_rate": 1e-06, + "loss": 0.8311, + "mean_token_accuracy": 0.7395049333572388, + "num_tokens": 383318047.0, + "step": 15145 + }, + { + "epoch": 1.6632989237865143, + "grad_norm": 2.4043545722961426, + "learning_rate": 1e-06, + "loss": 0.8001, + "mean_token_accuracy": 
0.7431662082672119, + "num_tokens": 383339500.0, + "step": 15146 + }, + { + "epoch": 1.663408741489128, + "grad_norm": 2.158254384994507, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7056479454040527, + "num_tokens": 383366509.0, + "step": 15147 + }, + { + "epoch": 1.6635185591917416, + "grad_norm": 2.3034441471099854, + "learning_rate": 1e-06, + "loss": 0.8187, + "mean_token_accuracy": 0.7377592325210571, + "num_tokens": 383388406.0, + "step": 15148 + }, + { + "epoch": 1.6636283768943554, + "grad_norm": 2.183917999267578, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.6946076154708862, + "num_tokens": 383416252.0, + "step": 15149 + }, + { + "epoch": 1.6637381945969691, + "grad_norm": 2.0445027351379395, + "learning_rate": 1e-06, + "loss": 0.8451, + "mean_token_accuracy": 0.7321847677230835, + "num_tokens": 383443976.0, + "step": 15150 + }, + { + "epoch": 1.6638480122995827, + "grad_norm": 2.5688276290893555, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7220791578292847, + "num_tokens": 383463636.0, + "step": 15151 + }, + { + "epoch": 1.6639578300021962, + "grad_norm": 2.337592840194702, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7003902196884155, + "num_tokens": 383487416.0, + "step": 15152 + }, + { + "epoch": 1.66406764770481, + "grad_norm": 2.6827616691589355, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7277939319610596, + "num_tokens": 383507447.0, + "step": 15153 + }, + { + "epoch": 1.6641774654074237, + "grad_norm": 2.237706184387207, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7048441171646118, + "num_tokens": 383534559.0, + "step": 15154 + }, + { + "epoch": 1.6642872831100375, + "grad_norm": 2.049551486968994, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7164151668548584, + "num_tokens": 383564331.0, + "step": 15155 + }, + { + "epoch": 1.664397100812651, + "grad_norm": 
2.191443681716919, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7156434059143066, + "num_tokens": 383591591.0, + "step": 15156 + }, + { + "epoch": 1.6645069185152646, + "grad_norm": 2.0881543159484863, + "learning_rate": 1e-06, + "loss": 0.8231, + "mean_token_accuracy": 0.7467362880706787, + "num_tokens": 383619977.0, + "step": 15157 + }, + { + "epoch": 1.6646167362178783, + "grad_norm": 2.3612992763519287, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.708549976348877, + "num_tokens": 383643462.0, + "step": 15158 + }, + { + "epoch": 1.664726553920492, + "grad_norm": 2.337066888809204, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7091007232666016, + "num_tokens": 383669610.0, + "step": 15159 + }, + { + "epoch": 1.6648363716231056, + "grad_norm": 2.2730884552001953, + "learning_rate": 1e-06, + "loss": 0.8296, + "mean_token_accuracy": 0.7377501726150513, + "num_tokens": 383693518.0, + "step": 15160 + }, + { + "epoch": 1.6649461893257194, + "grad_norm": 2.227210521697998, + "learning_rate": 1e-06, + "loss": 0.8657, + "mean_token_accuracy": 0.7365796566009521, + "num_tokens": 383717973.0, + "step": 15161 + }, + { + "epoch": 1.665056007028333, + "grad_norm": 1.9729914665222168, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7064984440803528, + "num_tokens": 383752544.0, + "step": 15162 + }, + { + "epoch": 1.6651658247309467, + "grad_norm": 1.9506583213806152, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7109165191650391, + "num_tokens": 383786171.0, + "step": 15163 + }, + { + "epoch": 1.6652756424335604, + "grad_norm": 2.2374298572540283, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.6936720609664917, + "num_tokens": 383812162.0, + "step": 15164 + }, + { + "epoch": 1.665385460136174, + "grad_norm": 2.3581745624542236, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7183099389076233, + "num_tokens": 
383834689.0, + "step": 15165 + }, + { + "epoch": 1.6654952778387875, + "grad_norm": 2.226792335510254, + "learning_rate": 1e-06, + "loss": 0.8591, + "mean_token_accuracy": 0.7282407879829407, + "num_tokens": 383860861.0, + "step": 15166 + }, + { + "epoch": 1.6656050955414012, + "grad_norm": 2.400963068008423, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7276005148887634, + "num_tokens": 383882326.0, + "step": 15167 + }, + { + "epoch": 1.665714913244015, + "grad_norm": 2.1249895095825195, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7116490602493286, + "num_tokens": 383911358.0, + "step": 15168 + }, + { + "epoch": 1.6658247309466288, + "grad_norm": 2.367865800857544, + "learning_rate": 1e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7427435517311096, + "num_tokens": 383934297.0, + "step": 15169 + }, + { + "epoch": 1.6659345486492423, + "grad_norm": 2.0741710662841797, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7084432244300842, + "num_tokens": 383962223.0, + "step": 15170 + }, + { + "epoch": 1.6660443663518558, + "grad_norm": 2.1647610664367676, + "learning_rate": 1e-06, + "loss": 0.8513, + "mean_token_accuracy": 0.732657790184021, + "num_tokens": 383990550.0, + "step": 15171 + }, + { + "epoch": 1.6661541840544696, + "grad_norm": 2.1036980152130127, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 0.6957906484603882, + "num_tokens": 384021023.0, + "step": 15172 + }, + { + "epoch": 1.6662640017570833, + "grad_norm": 2.170607805252075, + "learning_rate": 1e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.7251505255699158, + "num_tokens": 384047720.0, + "step": 15173 + }, + { + "epoch": 1.6663738194596969, + "grad_norm": 2.038486957550049, + "learning_rate": 1e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.7154886722564697, + "num_tokens": 384077524.0, + "step": 15174 + }, + { + "epoch": 1.6664836371623104, + "grad_norm": 2.1635425090789795, + "learning_rate": 
1e-06, + "loss": 0.8785, + "mean_token_accuracy": 0.7268763780593872, + "num_tokens": 384103188.0, + "step": 15175 + }, + { + "epoch": 1.6665934548649242, + "grad_norm": 2.004875659942627, + "learning_rate": 1e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.7347110509872437, + "num_tokens": 384134381.0, + "step": 15176 + }, + { + "epoch": 1.666703272567538, + "grad_norm": 2.291950225830078, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.6992979049682617, + "num_tokens": 384161511.0, + "step": 15177 + }, + { + "epoch": 1.6668130902701517, + "grad_norm": 2.0411720275878906, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7290753722190857, + "num_tokens": 384189319.0, + "step": 15178 + }, + { + "epoch": 1.6669229079727652, + "grad_norm": 3.0090959072113037, + "learning_rate": 1e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7225682139396667, + "num_tokens": 384214704.0, + "step": 15179 + }, + { + "epoch": 1.6670327256753787, + "grad_norm": 2.0043351650238037, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.6986089944839478, + "num_tokens": 384246846.0, + "step": 15180 + }, + { + "epoch": 1.6671425433779925, + "grad_norm": 2.4317970275878906, + "learning_rate": 1e-06, + "loss": 0.8355, + "mean_token_accuracy": 0.7343933582305908, + "num_tokens": 384269138.0, + "step": 15181 + }, + { + "epoch": 1.6672523610806063, + "grad_norm": 2.360426664352417, + "learning_rate": 1e-06, + "loss": 0.8479, + "mean_token_accuracy": 0.7373722791671753, + "num_tokens": 384292399.0, + "step": 15182 + }, + { + "epoch": 1.66736217878322, + "grad_norm": 2.2586448192596436, + "learning_rate": 1e-06, + "loss": 0.8443, + "mean_token_accuracy": 0.7386584281921387, + "num_tokens": 384315281.0, + "step": 15183 + }, + { + "epoch": 1.6674719964858336, + "grad_norm": 2.3656728267669678, + "learning_rate": 1e-06, + "loss": 0.8108, + "mean_token_accuracy": 0.7400546073913574, + "num_tokens": 384337620.0, + "step": 15184 + }, + { + 
"epoch": 1.667581814188447, + "grad_norm": 2.632767915725708, + "learning_rate": 1e-06, + "loss": 0.8677, + "mean_token_accuracy": 0.7234660387039185, + "num_tokens": 384357686.0, + "step": 15185 + }, + { + "epoch": 1.6676916318910608, + "grad_norm": 2.118222951889038, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7054578065872192, + "num_tokens": 384386298.0, + "step": 15186 + }, + { + "epoch": 1.6678014495936746, + "grad_norm": 2.4397900104522705, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7178401947021484, + "num_tokens": 384408507.0, + "step": 15187 + }, + { + "epoch": 1.6679112672962881, + "grad_norm": 2.3675670623779297, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7088437080383301, + "num_tokens": 384432476.0, + "step": 15188 + }, + { + "epoch": 1.6680210849989017, + "grad_norm": 2.3385848999023438, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7134826183319092, + "num_tokens": 384456571.0, + "step": 15189 + }, + { + "epoch": 1.6681309027015154, + "grad_norm": 2.4681460857391357, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7141240239143372, + "num_tokens": 384478723.0, + "step": 15190 + }, + { + "epoch": 1.6682407204041292, + "grad_norm": 2.0396201610565186, + "learning_rate": 1e-06, + "loss": 0.8492, + "mean_token_accuracy": 0.7426039576530457, + "num_tokens": 384507567.0, + "step": 15191 + }, + { + "epoch": 1.668350538106743, + "grad_norm": 2.0059850215911865, + "learning_rate": 1e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.7281202077865601, + "num_tokens": 384538695.0, + "step": 15192 + }, + { + "epoch": 1.6684603558093565, + "grad_norm": 2.1478211879730225, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.698508083820343, + "num_tokens": 384566366.0, + "step": 15193 + }, + { + "epoch": 1.66857017351197, + "grad_norm": 2.447357654571533, + "learning_rate": 1e-06, + "loss": 0.8245, + 
"mean_token_accuracy": 0.7409995794296265, + "num_tokens": 384587055.0, + "step": 15194 + }, + { + "epoch": 1.6686799912145838, + "grad_norm": 2.5540692806243896, + "learning_rate": 1e-06, + "loss": 0.8464, + "mean_token_accuracy": 0.7348746061325073, + "num_tokens": 384605937.0, + "step": 15195 + }, + { + "epoch": 1.6687898089171975, + "grad_norm": 2.2247512340545654, + "learning_rate": 1e-06, + "loss": 0.8639, + "mean_token_accuracy": 0.7273037433624268, + "num_tokens": 384631808.0, + "step": 15196 + }, + { + "epoch": 1.668899626619811, + "grad_norm": 2.2363617420196533, + "learning_rate": 1e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.728186845779419, + "num_tokens": 384656455.0, + "step": 15197 + }, + { + "epoch": 1.6690094443224248, + "grad_norm": 2.29630970954895, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7138416767120361, + "num_tokens": 384682353.0, + "step": 15198 + }, + { + "epoch": 1.6691192620250384, + "grad_norm": 1.8904788494110107, + "learning_rate": 1e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7399768829345703, + "num_tokens": 384715160.0, + "step": 15199 + }, + { + "epoch": 1.669229079727652, + "grad_norm": 2.2395873069763184, + "learning_rate": 1e-06, + "loss": 0.8653, + "mean_token_accuracy": 0.7266730070114136, + "num_tokens": 384741910.0, + "step": 15200 + }, + { + "epoch": 1.6693388974302659, + "grad_norm": 2.1915199756622314, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7232409715652466, + "num_tokens": 384769202.0, + "step": 15201 + }, + { + "epoch": 1.6694487151328794, + "grad_norm": 2.2751071453094482, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7129834890365601, + "num_tokens": 384793476.0, + "step": 15202 + }, + { + "epoch": 1.669558532835493, + "grad_norm": 2.3031668663024902, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7256161570549011, + "num_tokens": 384817656.0, + "step": 15203 + }, + { + "epoch": 
1.6696683505381067, + "grad_norm": 2.417598247528076, + "learning_rate": 1e-06, + "loss": 0.786, + "mean_token_accuracy": 0.7477030754089355, + "num_tokens": 384838407.0, + "step": 15204 + }, + { + "epoch": 1.6697781682407205, + "grad_norm": 2.4356677532196045, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7080950140953064, + "num_tokens": 384862803.0, + "step": 15205 + }, + { + "epoch": 1.6698879859433342, + "grad_norm": 2.2709836959838867, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7228095531463623, + "num_tokens": 384888749.0, + "step": 15206 + }, + { + "epoch": 1.6699978036459477, + "grad_norm": 2.056295394897461, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7085585594177246, + "num_tokens": 384917737.0, + "step": 15207 + }, + { + "epoch": 1.6701076213485613, + "grad_norm": 2.1163556575775146, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7158789038658142, + "num_tokens": 384945744.0, + "step": 15208 + }, + { + "epoch": 1.670217439051175, + "grad_norm": 2.4176008701324463, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7119016647338867, + "num_tokens": 384968005.0, + "step": 15209 + }, + { + "epoch": 1.6703272567537888, + "grad_norm": 2.5307414531707764, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7125028967857361, + "num_tokens": 384989192.0, + "step": 15210 + }, + { + "epoch": 1.6704370744564023, + "grad_norm": 2.4361963272094727, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7140311598777771, + "num_tokens": 385011688.0, + "step": 15211 + }, + { + "epoch": 1.670546892159016, + "grad_norm": 2.326601982116699, + "learning_rate": 1e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.7391008138656616, + "num_tokens": 385034308.0, + "step": 15212 + }, + { + "epoch": 1.6706567098616296, + "grad_norm": 2.762406826019287, + "learning_rate": 1e-06, + "loss": 0.8203, + "mean_token_accuracy": 
0.7357316613197327, + "num_tokens": 385051820.0, + "step": 15213 + }, + { + "epoch": 1.6707665275642434, + "grad_norm": 2.34649920463562, + "learning_rate": 1e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.7300161123275757, + "num_tokens": 385072935.0, + "step": 15214 + }, + { + "epoch": 1.6708763452668571, + "grad_norm": 2.551126718521118, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.716155469417572, + "num_tokens": 385094737.0, + "step": 15215 + }, + { + "epoch": 1.6709861629694707, + "grad_norm": 2.3510913848876953, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7214664220809937, + "num_tokens": 385119110.0, + "step": 15216 + }, + { + "epoch": 1.6710959806720842, + "grad_norm": 2.170743227005005, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7168611884117126, + "num_tokens": 385144932.0, + "step": 15217 + }, + { + "epoch": 1.671205798374698, + "grad_norm": 2.253528356552124, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7198119759559631, + "num_tokens": 385170554.0, + "step": 15218 + }, + { + "epoch": 1.6713156160773117, + "grad_norm": 2.18898344039917, + "learning_rate": 1e-06, + "loss": 0.8405, + "mean_token_accuracy": 0.7371670007705688, + "num_tokens": 385195353.0, + "step": 15219 + }, + { + "epoch": 1.6714254337799255, + "grad_norm": 2.17029070854187, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7124384045600891, + "num_tokens": 385224776.0, + "step": 15220 + }, + { + "epoch": 1.671535251482539, + "grad_norm": 2.0862133502960205, + "learning_rate": 1e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7280060052871704, + "num_tokens": 385252057.0, + "step": 15221 + }, + { + "epoch": 1.6716450691851525, + "grad_norm": 2.231271743774414, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.6964167356491089, + "num_tokens": 385278916.0, + "step": 15222 + }, + { + "epoch": 1.6717548868877663, + "grad_norm": 
2.4654884338378906, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7199162244796753, + "num_tokens": 385299632.0, + "step": 15223 + }, + { + "epoch": 1.67186470459038, + "grad_norm": 2.2560150623321533, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7224825620651245, + "num_tokens": 385326278.0, + "step": 15224 + }, + { + "epoch": 1.6719745222929936, + "grad_norm": 2.2212765216827393, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7159707546234131, + "num_tokens": 385352124.0, + "step": 15225 + }, + { + "epoch": 1.6720843399956071, + "grad_norm": 2.4372172355651855, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7100508213043213, + "num_tokens": 385373933.0, + "step": 15226 + }, + { + "epoch": 1.6721941576982209, + "grad_norm": 1.9589827060699463, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.725088894367218, + "num_tokens": 385404699.0, + "step": 15227 + }, + { + "epoch": 1.6723039754008346, + "grad_norm": 2.6252505779266357, + "learning_rate": 1e-06, + "loss": 0.7909, + "mean_token_accuracy": 0.7458539009094238, + "num_tokens": 385422398.0, + "step": 15228 + }, + { + "epoch": 1.6724137931034484, + "grad_norm": 2.0673160552978516, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7043373584747314, + "num_tokens": 385452083.0, + "step": 15229 + }, + { + "epoch": 1.672523610806062, + "grad_norm": 2.196672201156616, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.72231525182724, + "num_tokens": 385479713.0, + "step": 15230 + }, + { + "epoch": 1.6726334285086755, + "grad_norm": 2.4937186241149902, + "learning_rate": 1e-06, + "loss": 0.865, + "mean_token_accuracy": 0.7344589233398438, + "num_tokens": 385500314.0, + "step": 15231 + }, + { + "epoch": 1.6727432462112892, + "grad_norm": 2.1597092151641846, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7114819288253784, + "num_tokens": 
385527804.0, + "step": 15232 + }, + { + "epoch": 1.672853063913903, + "grad_norm": 2.593837022781372, + "learning_rate": 1e-06, + "loss": 0.8212, + "mean_token_accuracy": 0.7349398732185364, + "num_tokens": 385548509.0, + "step": 15233 + }, + { + "epoch": 1.6729628816165167, + "grad_norm": 2.7082924842834473, + "learning_rate": 1e-06, + "loss": 0.858, + "mean_token_accuracy": 0.730666995048523, + "num_tokens": 385566340.0, + "step": 15234 + }, + { + "epoch": 1.6730726993191303, + "grad_norm": 2.648552656173706, + "learning_rate": 1e-06, + "loss": 0.8674, + "mean_token_accuracy": 0.7307747006416321, + "num_tokens": 385586768.0, + "step": 15235 + }, + { + "epoch": 1.6731825170217438, + "grad_norm": 2.292764186859131, + "learning_rate": 1e-06, + "loss": 0.8291, + "mean_token_accuracy": 0.7358126640319824, + "num_tokens": 385610653.0, + "step": 15236 + }, + { + "epoch": 1.6732923347243576, + "grad_norm": 2.182528495788574, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7113414406776428, + "num_tokens": 385637707.0, + "step": 15237 + }, + { + "epoch": 1.6734021524269713, + "grad_norm": 2.532498836517334, + "learning_rate": 1e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.7392228245735168, + "num_tokens": 385657018.0, + "step": 15238 + }, + { + "epoch": 1.6735119701295849, + "grad_norm": 2.8384759426116943, + "learning_rate": 1e-06, + "loss": 0.7878, + "mean_token_accuracy": 0.7457329034805298, + "num_tokens": 385674687.0, + "step": 15239 + }, + { + "epoch": 1.6736217878321984, + "grad_norm": 2.335955858230591, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7264856100082397, + "num_tokens": 385698282.0, + "step": 15240 + }, + { + "epoch": 1.6737316055348122, + "grad_norm": 2.388751268386841, + "learning_rate": 1e-06, + "loss": 0.8091, + "mean_token_accuracy": 0.7407630681991577, + "num_tokens": 385720172.0, + "step": 15241 + }, + { + "epoch": 1.673841423237426, + "grad_norm": 2.283644914627075, + "learning_rate": 1e-06, 
+ "loss": 0.8837, + "mean_token_accuracy": 0.7265956401824951, + "num_tokens": 385744353.0, + "step": 15242 + }, + { + "epoch": 1.6739512409400397, + "grad_norm": 2.21836256980896, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.721866250038147, + "num_tokens": 385770628.0, + "step": 15243 + }, + { + "epoch": 1.6740610586426532, + "grad_norm": 2.4294509887695312, + "learning_rate": 1e-06, + "loss": 0.8262, + "mean_token_accuracy": 0.7473347187042236, + "num_tokens": 385791833.0, + "step": 15244 + }, + { + "epoch": 1.6741708763452667, + "grad_norm": 2.223470687866211, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7012250423431396, + "num_tokens": 385818557.0, + "step": 15245 + }, + { + "epoch": 1.6742806940478805, + "grad_norm": 2.498286485671997, + "learning_rate": 1e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.726637065410614, + "num_tokens": 385839483.0, + "step": 15246 + }, + { + "epoch": 1.6743905117504942, + "grad_norm": 2.035552978515625, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7194280028343201, + "num_tokens": 385868288.0, + "step": 15247 + }, + { + "epoch": 1.674500329453108, + "grad_norm": 1.9577301740646362, + "learning_rate": 1e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.72780841588974, + "num_tokens": 385902193.0, + "step": 15248 + }, + { + "epoch": 1.6746101471557215, + "grad_norm": 2.313014507293701, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7140578627586365, + "num_tokens": 385925523.0, + "step": 15249 + }, + { + "epoch": 1.674719964858335, + "grad_norm": 2.1315038204193115, + "learning_rate": 1e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7290758490562439, + "num_tokens": 385953414.0, + "step": 15250 + }, + { + "epoch": 1.6748297825609488, + "grad_norm": 1.965653896331787, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7219043374061584, + "num_tokens": 385985275.0, + "step": 15251 + }, + { + "epoch": 
1.6749396002635626, + "grad_norm": 2.397003173828125, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7129533290863037, + "num_tokens": 386007504.0, + "step": 15252 + }, + { + "epoch": 1.6750494179661761, + "grad_norm": 2.3285224437713623, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7019010782241821, + "num_tokens": 386032105.0, + "step": 15253 + }, + { + "epoch": 1.6751592356687897, + "grad_norm": 2.1983773708343506, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7116894125938416, + "num_tokens": 386059085.0, + "step": 15254 + }, + { + "epoch": 1.6752690533714034, + "grad_norm": 2.7834279537200928, + "learning_rate": 1e-06, + "loss": 0.8531, + "mean_token_accuracy": 0.7356691956520081, + "num_tokens": 386078044.0, + "step": 15255 + }, + { + "epoch": 1.6753788710740172, + "grad_norm": 2.2846407890319824, + "learning_rate": 1e-06, + "loss": 0.8498, + "mean_token_accuracy": 0.7302517890930176, + "num_tokens": 386101598.0, + "step": 15256 + }, + { + "epoch": 1.675488688776631, + "grad_norm": 2.35494065284729, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7112971544265747, + "num_tokens": 386127591.0, + "step": 15257 + }, + { + "epoch": 1.6755985064792445, + "grad_norm": 2.1306023597717285, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7052198052406311, + "num_tokens": 386155147.0, + "step": 15258 + }, + { + "epoch": 1.675708324181858, + "grad_norm": 2.14829683303833, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7188003063201904, + "num_tokens": 386182285.0, + "step": 15259 + }, + { + "epoch": 1.6758181418844718, + "grad_norm": 2.00945782661438, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7195961475372314, + "num_tokens": 386211641.0, + "step": 15260 + }, + { + "epoch": 1.6759279595870855, + "grad_norm": 2.3198421001434326, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 
0.7074524760246277, + "num_tokens": 386237136.0, + "step": 15261 + }, + { + "epoch": 1.676037777289699, + "grad_norm": 1.9995568990707397, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7143601179122925, + "num_tokens": 386268461.0, + "step": 15262 + }, + { + "epoch": 1.6761475949923128, + "grad_norm": 2.0456626415252686, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7124715447425842, + "num_tokens": 386300820.0, + "step": 15263 + }, + { + "epoch": 1.6762574126949263, + "grad_norm": 2.3972530364990234, + "learning_rate": 1e-06, + "loss": 0.7754, + "mean_token_accuracy": 0.7545005083084106, + "num_tokens": 386322596.0, + "step": 15264 + }, + { + "epoch": 1.67636723039754, + "grad_norm": 2.0204386711120605, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7029615044593811, + "num_tokens": 386352537.0, + "step": 15265 + }, + { + "epoch": 1.6764770481001539, + "grad_norm": 2.238712787628174, + "learning_rate": 1e-06, + "loss": 0.8516, + "mean_token_accuracy": 0.7372438907623291, + "num_tokens": 386377282.0, + "step": 15266 + }, + { + "epoch": 1.6765868658027674, + "grad_norm": 2.05670428276062, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7134274244308472, + "num_tokens": 386407236.0, + "step": 15267 + }, + { + "epoch": 1.676696683505381, + "grad_norm": 2.3027844429016113, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7310407757759094, + "num_tokens": 386432115.0, + "step": 15268 + }, + { + "epoch": 1.6768065012079947, + "grad_norm": 2.157001495361328, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7105440497398376, + "num_tokens": 386459578.0, + "step": 15269 + }, + { + "epoch": 1.6769163189106084, + "grad_norm": 2.0256540775299072, + "learning_rate": 1e-06, + "loss": 0.8357, + "mean_token_accuracy": 0.7323050498962402, + "num_tokens": 386487454.0, + "step": 15270 + }, + { + "epoch": 1.6770261366132222, + "grad_norm": 
2.2334423065185547, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7269575595855713, + "num_tokens": 386514063.0, + "step": 15271 + }, + { + "epoch": 1.6771359543158357, + "grad_norm": 2.014133930206299, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7096024751663208, + "num_tokens": 386546050.0, + "step": 15272 + }, + { + "epoch": 1.6772457720184493, + "grad_norm": 2.166626214981079, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7093753814697266, + "num_tokens": 386574347.0, + "step": 15273 + }, + { + "epoch": 1.677355589721063, + "grad_norm": 2.123811721801758, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7053587436676025, + "num_tokens": 386603181.0, + "step": 15274 + }, + { + "epoch": 1.6774654074236768, + "grad_norm": 2.035226583480835, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7075299620628357, + "num_tokens": 386633360.0, + "step": 15275 + }, + { + "epoch": 1.6775752251262903, + "grad_norm": 2.434934377670288, + "learning_rate": 1e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7176147103309631, + "num_tokens": 386655679.0, + "step": 15276 + }, + { + "epoch": 1.677685042828904, + "grad_norm": 2.12497615814209, + "learning_rate": 1e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.720234215259552, + "num_tokens": 386684411.0, + "step": 15277 + }, + { + "epoch": 1.6777948605315176, + "grad_norm": 2.2103734016418457, + "learning_rate": 1e-06, + "loss": 0.8184, + "mean_token_accuracy": 0.7481532096862793, + "num_tokens": 386709370.0, + "step": 15278 + }, + { + "epoch": 1.6779046782341314, + "grad_norm": 2.494910955429077, + "learning_rate": 1e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.7201014757156372, + "num_tokens": 386730384.0, + "step": 15279 + }, + { + "epoch": 1.6780144959367451, + "grad_norm": 2.2746481895446777, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7154511213302612, + "num_tokens": 
386755625.0, + "step": 15280 + }, + { + "epoch": 1.6781243136393587, + "grad_norm": 2.056489944458008, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7037961483001709, + "num_tokens": 386786295.0, + "step": 15281 + }, + { + "epoch": 1.6782341313419722, + "grad_norm": 2.311290740966797, + "learning_rate": 1e-06, + "loss": 0.8614, + "mean_token_accuracy": 0.7358781099319458, + "num_tokens": 386809753.0, + "step": 15282 + }, + { + "epoch": 1.678343949044586, + "grad_norm": 2.336153984069824, + "learning_rate": 1e-06, + "loss": 0.8587, + "mean_token_accuracy": 0.7325600981712341, + "num_tokens": 386833331.0, + "step": 15283 + }, + { + "epoch": 1.6784537667471997, + "grad_norm": 2.3522305488586426, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7157571315765381, + "num_tokens": 386857893.0, + "step": 15284 + }, + { + "epoch": 1.6785635844498135, + "grad_norm": 2.3950576782226562, + "learning_rate": 1e-06, + "loss": 0.7576, + "mean_token_accuracy": 0.7656012773513794, + "num_tokens": 386879249.0, + "step": 15285 + }, + { + "epoch": 1.678673402152427, + "grad_norm": 2.478973388671875, + "learning_rate": 1e-06, + "loss": 0.8059, + "mean_token_accuracy": 0.7423334121704102, + "num_tokens": 386899771.0, + "step": 15286 + }, + { + "epoch": 1.6787832198550405, + "grad_norm": 2.1639044284820557, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7084289789199829, + "num_tokens": 386929344.0, + "step": 15287 + }, + { + "epoch": 1.6788930375576543, + "grad_norm": 2.614952802658081, + "learning_rate": 1e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7315768599510193, + "num_tokens": 386947861.0, + "step": 15288 + }, + { + "epoch": 1.679002855260268, + "grad_norm": 2.3631629943847656, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7045301198959351, + "num_tokens": 386972382.0, + "step": 15289 + }, + { + "epoch": 1.6791126729628816, + "grad_norm": 2.4186294078826904, + "learning_rate": 
1e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.7315466403961182, + "num_tokens": 386994515.0, + "step": 15290 + }, + { + "epoch": 1.6792224906654951, + "grad_norm": 2.5234220027923584, + "learning_rate": 1e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7156285047531128, + "num_tokens": 387016403.0, + "step": 15291 + }, + { + "epoch": 1.6793323083681089, + "grad_norm": 2.021685838699341, + "learning_rate": 1e-06, + "loss": 0.8307, + "mean_token_accuracy": 0.7437243461608887, + "num_tokens": 387044631.0, + "step": 15292 + }, + { + "epoch": 1.6794421260707226, + "grad_norm": 2.3513143062591553, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7103915214538574, + "num_tokens": 387070617.0, + "step": 15293 + }, + { + "epoch": 1.6795519437733364, + "grad_norm": 2.227010488510132, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7144684791564941, + "num_tokens": 387096935.0, + "step": 15294 + }, + { + "epoch": 1.67966176147595, + "grad_norm": 2.2605812549591064, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7152389883995056, + "num_tokens": 387122296.0, + "step": 15295 + }, + { + "epoch": 1.6797715791785635, + "grad_norm": 2.2750046253204346, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7089574337005615, + "num_tokens": 387146978.0, + "step": 15296 + }, + { + "epoch": 1.6798813968811772, + "grad_norm": 2.4112040996551514, + "learning_rate": 1e-06, + "loss": 0.8261, + "mean_token_accuracy": 0.7468647956848145, + "num_tokens": 387167620.0, + "step": 15297 + }, + { + "epoch": 1.679991214583791, + "grad_norm": 2.170409679412842, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.6961740851402283, + "num_tokens": 387195391.0, + "step": 15298 + }, + { + "epoch": 1.6801010322864047, + "grad_norm": 2.391221761703491, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7101125717163086, + "num_tokens": 387219169.0, + "step": 15299 + }, + { + 
"epoch": 1.6802108499890183, + "grad_norm": 2.050356864929199, + "learning_rate": 1e-06, + "loss": 0.8156, + "mean_token_accuracy": 0.742245614528656, + "num_tokens": 387248069.0, + "step": 15300 + }, + { + "epoch": 1.6803206676916318, + "grad_norm": 2.3576531410217285, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.7001081705093384, + "num_tokens": 387275360.0, + "step": 15301 + }, + { + "epoch": 1.6804304853942456, + "grad_norm": 1.9998722076416016, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.6897836327552795, + "num_tokens": 387311170.0, + "step": 15302 + }, + { + "epoch": 1.6805403030968593, + "grad_norm": 2.1576387882232666, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7243452072143555, + "num_tokens": 387339760.0, + "step": 15303 + }, + { + "epoch": 1.6806501207994728, + "grad_norm": 2.255861282348633, + "learning_rate": 1e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.7325606346130371, + "num_tokens": 387368049.0, + "step": 15304 + }, + { + "epoch": 1.6807599385020864, + "grad_norm": 2.360793352127075, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7139166593551636, + "num_tokens": 387390393.0, + "step": 15305 + }, + { + "epoch": 1.6808697562047001, + "grad_norm": 2.0726027488708496, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7005549073219299, + "num_tokens": 387420184.0, + "step": 15306 + }, + { + "epoch": 1.680979573907314, + "grad_norm": 1.9482618570327759, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7109531760215759, + "num_tokens": 387452468.0, + "step": 15307 + }, + { + "epoch": 1.6810893916099277, + "grad_norm": 2.408590316772461, + "learning_rate": 1e-06, + "loss": 0.8426, + "mean_token_accuracy": 0.7292729020118713, + "num_tokens": 387473588.0, + "step": 15308 + }, + { + "epoch": 1.6811992093125412, + "grad_norm": 2.59516978263855, + "learning_rate": 1e-06, + "loss": 0.819, + 
"mean_token_accuracy": 0.7395938634872437, + "num_tokens": 387493227.0, + "step": 15309 + }, + { + "epoch": 1.6813090270151547, + "grad_norm": 2.147412061691284, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.6978122591972351, + "num_tokens": 387523649.0, + "step": 15310 + }, + { + "epoch": 1.6814188447177685, + "grad_norm": 2.1647396087646484, + "learning_rate": 1e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.7016341686248779, + "num_tokens": 387549926.0, + "step": 15311 + }, + { + "epoch": 1.6815286624203822, + "grad_norm": 2.1352367401123047, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7073182463645935, + "num_tokens": 387579166.0, + "step": 15312 + }, + { + "epoch": 1.681638480122996, + "grad_norm": 2.5328385829925537, + "learning_rate": 1e-06, + "loss": 0.7671, + "mean_token_accuracy": 0.7502425312995911, + "num_tokens": 387600477.0, + "step": 15313 + }, + { + "epoch": 1.6817482978256095, + "grad_norm": 2.8176653385162354, + "learning_rate": 1e-06, + "loss": 0.8313, + "mean_token_accuracy": 0.7286840081214905, + "num_tokens": 387617828.0, + "step": 15314 + }, + { + "epoch": 1.681858115528223, + "grad_norm": 2.5712618827819824, + "learning_rate": 1e-06, + "loss": 0.8295, + "mean_token_accuracy": 0.734299898147583, + "num_tokens": 387637001.0, + "step": 15315 + }, + { + "epoch": 1.6819679332308368, + "grad_norm": 2.567046642303467, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7308763861656189, + "num_tokens": 387659332.0, + "step": 15316 + }, + { + "epoch": 1.6820777509334506, + "grad_norm": 2.4188778400421143, + "learning_rate": 1e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.7357898354530334, + "num_tokens": 387682650.0, + "step": 15317 + }, + { + "epoch": 1.6821875686360641, + "grad_norm": 2.029055595397949, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.712671160697937, + "num_tokens": 387714562.0, + "step": 15318 + }, + { + "epoch": 
1.6822973863386776, + "grad_norm": 2.2401466369628906, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.681695818901062, + "num_tokens": 387742721.0, + "step": 15319 + }, + { + "epoch": 1.6824072040412914, + "grad_norm": 2.487644672393799, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7074236273765564, + "num_tokens": 387766023.0, + "step": 15320 + }, + { + "epoch": 1.6825170217439052, + "grad_norm": 2.0906243324279785, + "learning_rate": 1e-06, + "loss": 0.8428, + "mean_token_accuracy": 0.7314828634262085, + "num_tokens": 387794144.0, + "step": 15321 + }, + { + "epoch": 1.682626839446519, + "grad_norm": 2.058767795562744, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7168580293655396, + "num_tokens": 387824529.0, + "step": 15322 + }, + { + "epoch": 1.6827366571491325, + "grad_norm": 2.5044057369232178, + "learning_rate": 1e-06, + "loss": 0.8185, + "mean_token_accuracy": 0.7389012575149536, + "num_tokens": 387845040.0, + "step": 15323 + }, + { + "epoch": 1.682846474851746, + "grad_norm": 2.143303155899048, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7117908596992493, + "num_tokens": 387871180.0, + "step": 15324 + }, + { + "epoch": 1.6829562925543597, + "grad_norm": 2.577070951461792, + "learning_rate": 1e-06, + "loss": 0.7465, + "mean_token_accuracy": 0.7662445306777954, + "num_tokens": 387890117.0, + "step": 15325 + }, + { + "epoch": 1.6830661102569735, + "grad_norm": 2.3737995624542236, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7315963506698608, + "num_tokens": 387912575.0, + "step": 15326 + }, + { + "epoch": 1.683175927959587, + "grad_norm": 2.1763486862182617, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.6940838694572449, + "num_tokens": 387942970.0, + "step": 15327 + }, + { + "epoch": 1.6832857456622008, + "grad_norm": 2.604789972305298, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 
0.7208507657051086, + "num_tokens": 387962650.0, + "step": 15328 + }, + { + "epoch": 1.6833955633648143, + "grad_norm": 2.466977119445801, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7265771627426147, + "num_tokens": 387983306.0, + "step": 15329 + }, + { + "epoch": 1.683505381067428, + "grad_norm": 2.440246820449829, + "learning_rate": 1e-06, + "loss": 0.779, + "mean_token_accuracy": 0.7461099028587341, + "num_tokens": 388003901.0, + "step": 15330 + }, + { + "epoch": 1.6836151987700418, + "grad_norm": 2.322193145751953, + "learning_rate": 1e-06, + "loss": 0.8573, + "mean_token_accuracy": 0.7317975163459778, + "num_tokens": 388026272.0, + "step": 15331 + }, + { + "epoch": 1.6837250164726554, + "grad_norm": 2.0562901496887207, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7277604937553406, + "num_tokens": 388054420.0, + "step": 15332 + }, + { + "epoch": 1.683834834175269, + "grad_norm": 2.2794442176818848, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7087976336479187, + "num_tokens": 388080164.0, + "step": 15333 + }, + { + "epoch": 1.6839446518778827, + "grad_norm": 2.2471365928649902, + "learning_rate": 1e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.7314152717590332, + "num_tokens": 388103284.0, + "step": 15334 + }, + { + "epoch": 1.6840544695804964, + "grad_norm": 1.9061578512191772, + "learning_rate": 1e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7284544706344604, + "num_tokens": 388137119.0, + "step": 15335 + }, + { + "epoch": 1.6841642872831102, + "grad_norm": 2.458465576171875, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7177416086196899, + "num_tokens": 388161219.0, + "step": 15336 + }, + { + "epoch": 1.6842741049857237, + "grad_norm": 2.0651118755340576, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7254290580749512, + "num_tokens": 388190337.0, + "step": 15337 + }, + { + "epoch": 1.6843839226883373, + "grad_norm": 
2.0770676136016846, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7290377616882324, + "num_tokens": 388217988.0, + "step": 15338 + }, + { + "epoch": 1.684493740390951, + "grad_norm": 2.185920476913452, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.709587037563324, + "num_tokens": 388245925.0, + "step": 15339 + }, + { + "epoch": 1.6846035580935648, + "grad_norm": 2.2416815757751465, + "learning_rate": 1e-06, + "loss": 0.7476, + "mean_token_accuracy": 0.7617874145507812, + "num_tokens": 388270320.0, + "step": 15340 + }, + { + "epoch": 1.6847133757961783, + "grad_norm": 2.401331663131714, + "learning_rate": 1e-06, + "loss": 0.8289, + "mean_token_accuracy": 0.7578601837158203, + "num_tokens": 388291578.0, + "step": 15341 + }, + { + "epoch": 1.684823193498792, + "grad_norm": 2.4312989711761475, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7141028642654419, + "num_tokens": 388315610.0, + "step": 15342 + }, + { + "epoch": 1.6849330112014056, + "grad_norm": 2.182572841644287, + "learning_rate": 1e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.736585259437561, + "num_tokens": 388341166.0, + "step": 15343 + }, + { + "epoch": 1.6850428289040194, + "grad_norm": 2.5955469608306885, + "learning_rate": 1e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.752227246761322, + "num_tokens": 388359513.0, + "step": 15344 + }, + { + "epoch": 1.685152646606633, + "grad_norm": 2.3594162464141846, + "learning_rate": 1e-06, + "loss": 0.7824, + "mean_token_accuracy": 0.7542566061019897, + "num_tokens": 388380364.0, + "step": 15345 + }, + { + "epoch": 1.6852624643092466, + "grad_norm": 2.165013551712036, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7086424231529236, + "num_tokens": 388409016.0, + "step": 15346 + }, + { + "epoch": 1.6853722820118602, + "grad_norm": 2.168813943862915, + "learning_rate": 1e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7447591423988342, + "num_tokens": 
388434812.0, + "step": 15347 + }, + { + "epoch": 1.685482099714474, + "grad_norm": 2.169494152069092, + "learning_rate": 1e-06, + "loss": 0.8682, + "mean_token_accuracy": 0.7299950122833252, + "num_tokens": 388461177.0, + "step": 15348 + }, + { + "epoch": 1.6855919174170877, + "grad_norm": 2.454904079437256, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7032438516616821, + "num_tokens": 388484277.0, + "step": 15349 + }, + { + "epoch": 1.6857017351197014, + "grad_norm": 2.2228031158447266, + "learning_rate": 1e-06, + "loss": 0.8205, + "mean_token_accuracy": 0.7405559420585632, + "num_tokens": 388510268.0, + "step": 15350 + }, + { + "epoch": 1.685811552822315, + "grad_norm": 2.129772424697876, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7128430604934692, + "num_tokens": 388538234.0, + "step": 15351 + }, + { + "epoch": 1.6859213705249285, + "grad_norm": 2.2540712356567383, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7148430347442627, + "num_tokens": 388563245.0, + "step": 15352 + }, + { + "epoch": 1.6860311882275423, + "grad_norm": 2.128692150115967, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.707984983921051, + "num_tokens": 388592324.0, + "step": 15353 + }, + { + "epoch": 1.686141005930156, + "grad_norm": 2.00462007522583, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7015246748924255, + "num_tokens": 388622744.0, + "step": 15354 + }, + { + "epoch": 1.6862508236327696, + "grad_norm": 2.1912269592285156, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.721360981464386, + "num_tokens": 388650675.0, + "step": 15355 + }, + { + "epoch": 1.686360641335383, + "grad_norm": 2.3945045471191406, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7220743894577026, + "num_tokens": 388672222.0, + "step": 15356 + }, + { + "epoch": 1.6864704590379969, + "grad_norm": 2.8495428562164307, + "learning_rate": 1e-06, + 
"loss": 0.8541, + "mean_token_accuracy": 0.7425987124443054, + "num_tokens": 388689215.0, + "step": 15357 + }, + { + "epoch": 1.6865802767406106, + "grad_norm": 2.1243138313293457, + "learning_rate": 1e-06, + "loss": 1.0462, + "mean_token_accuracy": 0.6810480952262878, + "num_tokens": 388719790.0, + "step": 15358 + }, + { + "epoch": 1.6866900944432244, + "grad_norm": 2.4065942764282227, + "learning_rate": 1e-06, + "loss": 0.852, + "mean_token_accuracy": 0.7309629917144775, + "num_tokens": 388741634.0, + "step": 15359 + }, + { + "epoch": 1.686799912145838, + "grad_norm": 2.887019634246826, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7271699905395508, + "num_tokens": 388760276.0, + "step": 15360 + }, + { + "epoch": 1.6869097298484514, + "grad_norm": 2.4775941371917725, + "learning_rate": 1e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7252726554870605, + "num_tokens": 388783734.0, + "step": 15361 + }, + { + "epoch": 1.6870195475510652, + "grad_norm": 2.177830934524536, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7138540744781494, + "num_tokens": 388810838.0, + "step": 15362 + }, + { + "epoch": 1.687129365253679, + "grad_norm": 2.3362269401550293, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7053524255752563, + "num_tokens": 388836629.0, + "step": 15363 + }, + { + "epoch": 1.6872391829562927, + "grad_norm": 2.1340086460113525, + "learning_rate": 1e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.7228699922561646, + "num_tokens": 388864360.0, + "step": 15364 + }, + { + "epoch": 1.6873490006589063, + "grad_norm": 2.420862913131714, + "learning_rate": 1e-06, + "loss": 0.8398, + "mean_token_accuracy": 0.7351951599121094, + "num_tokens": 388888795.0, + "step": 15365 + }, + { + "epoch": 1.6874588183615198, + "grad_norm": 2.0820846557617188, + "learning_rate": 1e-06, + "loss": 1.0254, + "mean_token_accuracy": 0.7032318711280823, + "num_tokens": 388920674.0, + "step": 15366 + }, + { + 
"epoch": 1.6875686360641335, + "grad_norm": 2.396001100540161, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.6993495225906372, + "num_tokens": 388944160.0, + "step": 15367 + }, + { + "epoch": 1.6876784537667473, + "grad_norm": 2.023305654525757, + "learning_rate": 1e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7283552885055542, + "num_tokens": 388973064.0, + "step": 15368 + }, + { + "epoch": 1.6877882714693608, + "grad_norm": 2.3823795318603516, + "learning_rate": 1e-06, + "loss": 0.7297, + "mean_token_accuracy": 0.7592499256134033, + "num_tokens": 388994268.0, + "step": 15369 + }, + { + "epoch": 1.6878980891719744, + "grad_norm": 2.4340875148773193, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7196708917617798, + "num_tokens": 389017498.0, + "step": 15370 + }, + { + "epoch": 1.6880079068745881, + "grad_norm": 2.2961103916168213, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7041009664535522, + "num_tokens": 389041622.0, + "step": 15371 + }, + { + "epoch": 1.6881177245772019, + "grad_norm": 2.6146037578582764, + "learning_rate": 1e-06, + "loss": 0.838, + "mean_token_accuracy": 0.7360087633132935, + "num_tokens": 389061328.0, + "step": 15372 + }, + { + "epoch": 1.6882275422798156, + "grad_norm": 1.9849555492401123, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7130473852157593, + "num_tokens": 389092214.0, + "step": 15373 + }, + { + "epoch": 1.6883373599824292, + "grad_norm": 2.051028251647949, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7174662351608276, + "num_tokens": 389121937.0, + "step": 15374 + }, + { + "epoch": 1.6884471776850427, + "grad_norm": 2.1998817920684814, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7034368515014648, + "num_tokens": 389148916.0, + "step": 15375 + }, + { + "epoch": 1.6885569953876565, + "grad_norm": 2.573758840560913, + "learning_rate": 1e-06, + "loss": 0.8588, + 
"mean_token_accuracy": 0.7229912877082825, + "num_tokens": 389170678.0, + "step": 15376 + }, + { + "epoch": 1.6886668130902702, + "grad_norm": 2.2537224292755127, + "learning_rate": 1e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7151552438735962, + "num_tokens": 389197269.0, + "step": 15377 + }, + { + "epoch": 1.6887766307928838, + "grad_norm": 2.126288890838623, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7029272317886353, + "num_tokens": 389228247.0, + "step": 15378 + }, + { + "epoch": 1.6888864484954975, + "grad_norm": 2.3861262798309326, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7168139219284058, + "num_tokens": 389253357.0, + "step": 15379 + }, + { + "epoch": 1.688996266198111, + "grad_norm": 2.5587222576141357, + "learning_rate": 1e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7339863777160645, + "num_tokens": 389273721.0, + "step": 15380 + }, + { + "epoch": 1.6891060839007248, + "grad_norm": 2.207768201828003, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7225908041000366, + "num_tokens": 389301723.0, + "step": 15381 + }, + { + "epoch": 1.6892159016033386, + "grad_norm": 2.2265806198120117, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7111350297927856, + "num_tokens": 389326838.0, + "step": 15382 + }, + { + "epoch": 1.689325719305952, + "grad_norm": 1.9002147912979126, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.701852560043335, + "num_tokens": 389363587.0, + "step": 15383 + }, + { + "epoch": 1.6894355370085656, + "grad_norm": 2.1526334285736084, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.714311957359314, + "num_tokens": 389391181.0, + "step": 15384 + }, + { + "epoch": 1.6895453547111794, + "grad_norm": 2.286067247390747, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7071768045425415, + "num_tokens": 389417907.0, + "step": 15385 + }, + { + "epoch": 
1.6896551724137931, + "grad_norm": 2.350609064102173, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7016375064849854, + "num_tokens": 389441758.0, + "step": 15386 + }, + { + "epoch": 1.689764990116407, + "grad_norm": 2.1660473346710205, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7097758054733276, + "num_tokens": 389470029.0, + "step": 15387 + }, + { + "epoch": 1.6898748078190204, + "grad_norm": 2.5346767902374268, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7075780630111694, + "num_tokens": 389491934.0, + "step": 15388 + }, + { + "epoch": 1.689984625521634, + "grad_norm": 2.29404616355896, + "learning_rate": 1e-06, + "loss": 0.8318, + "mean_token_accuracy": 0.7359843850135803, + "num_tokens": 389516011.0, + "step": 15389 + }, + { + "epoch": 1.6900944432242477, + "grad_norm": 2.4095299243927, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7081657648086548, + "num_tokens": 389538850.0, + "step": 15390 + }, + { + "epoch": 1.6902042609268615, + "grad_norm": 2.2270069122314453, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7267171144485474, + "num_tokens": 389564955.0, + "step": 15391 + }, + { + "epoch": 1.690314078629475, + "grad_norm": 1.8884146213531494, + "learning_rate": 1e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7250962257385254, + "num_tokens": 389597199.0, + "step": 15392 + }, + { + "epoch": 1.6904238963320888, + "grad_norm": 2.087405204772949, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7169573903083801, + "num_tokens": 389627417.0, + "step": 15393 + }, + { + "epoch": 1.6905337140347023, + "grad_norm": 2.423266887664795, + "learning_rate": 1e-06, + "loss": 0.8581, + "mean_token_accuracy": 0.7302257418632507, + "num_tokens": 389650291.0, + "step": 15394 + }, + { + "epoch": 1.690643531737316, + "grad_norm": 2.4815399646759033, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 
0.7112059593200684, + "num_tokens": 389672577.0, + "step": 15395 + }, + { + "epoch": 1.6907533494399298, + "grad_norm": 2.289153575897217, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7277709245681763, + "num_tokens": 389696531.0, + "step": 15396 + }, + { + "epoch": 1.6908631671425434, + "grad_norm": 2.2061266899108887, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.703776478767395, + "num_tokens": 389722139.0, + "step": 15397 + }, + { + "epoch": 1.690972984845157, + "grad_norm": 2.046656608581543, + "learning_rate": 1e-06, + "loss": 1.003, + "mean_token_accuracy": 0.6943259239196777, + "num_tokens": 389756482.0, + "step": 15398 + }, + { + "epoch": 1.6910828025477707, + "grad_norm": 2.359262228012085, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7198994159698486, + "num_tokens": 389780905.0, + "step": 15399 + }, + { + "epoch": 1.6911926202503844, + "grad_norm": 2.1511192321777344, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7093018889427185, + "num_tokens": 389808170.0, + "step": 15400 + }, + { + "epoch": 1.6913024379529982, + "grad_norm": 2.265725612640381, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7105156779289246, + "num_tokens": 389834426.0, + "step": 15401 + }, + { + "epoch": 1.6914122556556117, + "grad_norm": 2.225313186645508, + "learning_rate": 1e-06, + "loss": 0.8699, + "mean_token_accuracy": 0.7252861261367798, + "num_tokens": 389860635.0, + "step": 15402 + }, + { + "epoch": 1.6915220733582252, + "grad_norm": 2.269670009613037, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7216572761535645, + "num_tokens": 389887358.0, + "step": 15403 + }, + { + "epoch": 1.691631891060839, + "grad_norm": 1.9475294351577759, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7023743391036987, + "num_tokens": 389921840.0, + "step": 15404 + }, + { + "epoch": 1.6917417087634528, + "grad_norm": 
2.140995979309082, + "learning_rate": 1e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.7205625772476196, + "num_tokens": 389949417.0, + "step": 15405 + }, + { + "epoch": 1.6918515264660663, + "grad_norm": 2.192967653274536, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7160439491271973, + "num_tokens": 389976727.0, + "step": 15406 + }, + { + "epoch": 1.6919613441686798, + "grad_norm": 1.904758334159851, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.6993560194969177, + "num_tokens": 390010753.0, + "step": 15407 + }, + { + "epoch": 1.6920711618712936, + "grad_norm": 2.447617292404175, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7276277542114258, + "num_tokens": 390032834.0, + "step": 15408 + }, + { + "epoch": 1.6921809795739073, + "grad_norm": 2.3428080081939697, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7146254181861877, + "num_tokens": 390060781.0, + "step": 15409 + }, + { + "epoch": 1.692290797276521, + "grad_norm": 1.9780285358428955, + "learning_rate": 1e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.7244671583175659, + "num_tokens": 390091275.0, + "step": 15410 + }, + { + "epoch": 1.6924006149791346, + "grad_norm": 1.9298771619796753, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7014628052711487, + "num_tokens": 390125282.0, + "step": 15411 + }, + { + "epoch": 1.6925104326817482, + "grad_norm": 2.28157377243042, + "learning_rate": 1e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7279379367828369, + "num_tokens": 390148340.0, + "step": 15412 + }, + { + "epoch": 1.692620250384362, + "grad_norm": 2.277073383331299, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.7002968788146973, + "num_tokens": 390175786.0, + "step": 15413 + }, + { + "epoch": 1.6927300680869757, + "grad_norm": 1.930428385734558, + "learning_rate": 1e-06, + "loss": 0.8125, + "mean_token_accuracy": 0.7436500787734985, + "num_tokens": 
390206286.0, + "step": 15414 + }, + { + "epoch": 1.6928398857895894, + "grad_norm": 2.4178497791290283, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7338718175888062, + "num_tokens": 390228084.0, + "step": 15415 + }, + { + "epoch": 1.692949703492203, + "grad_norm": 2.063462018966675, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7185337543487549, + "num_tokens": 390254687.0, + "step": 15416 + }, + { + "epoch": 1.6930595211948165, + "grad_norm": 2.3713366985321045, + "learning_rate": 1e-06, + "loss": 0.8361, + "mean_token_accuracy": 0.7329951524734497, + "num_tokens": 390275905.0, + "step": 15417 + }, + { + "epoch": 1.6931693388974303, + "grad_norm": 2.195817708969116, + "learning_rate": 1e-06, + "loss": 0.8339, + "mean_token_accuracy": 0.7292243838310242, + "num_tokens": 390301061.0, + "step": 15418 + }, + { + "epoch": 1.693279156600044, + "grad_norm": 2.1940720081329346, + "learning_rate": 1e-06, + "loss": 0.852, + "mean_token_accuracy": 0.7372676730155945, + "num_tokens": 390326309.0, + "step": 15419 + }, + { + "epoch": 1.6933889743026576, + "grad_norm": 2.3888020515441895, + "learning_rate": 1e-06, + "loss": 0.8326, + "mean_token_accuracy": 0.7333210110664368, + "num_tokens": 390347316.0, + "step": 15420 + }, + { + "epoch": 1.693498792005271, + "grad_norm": 2.942737102508545, + "learning_rate": 1e-06, + "loss": 0.8022, + "mean_token_accuracy": 0.7401316165924072, + "num_tokens": 390364167.0, + "step": 15421 + }, + { + "epoch": 1.6936086097078848, + "grad_norm": 1.861725091934204, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.6996665000915527, + "num_tokens": 390400601.0, + "step": 15422 + }, + { + "epoch": 1.6937184274104986, + "grad_norm": 2.3734729290008545, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7137957811355591, + "num_tokens": 390423783.0, + "step": 15423 + }, + { + "epoch": 1.6938282451131124, + "grad_norm": 2.2965338230133057, + "learning_rate": 
1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.6989654898643494, + "num_tokens": 390448678.0, + "step": 15424 + }, + { + "epoch": 1.693938062815726, + "grad_norm": 2.1670613288879395, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7154155373573303, + "num_tokens": 390477684.0, + "step": 15425 + }, + { + "epoch": 1.6940478805183394, + "grad_norm": 2.1853113174438477, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7051858901977539, + "num_tokens": 390503967.0, + "step": 15426 + }, + { + "epoch": 1.6941576982209532, + "grad_norm": 2.401186227798462, + "learning_rate": 1e-06, + "loss": 0.8473, + "mean_token_accuracy": 0.7370673418045044, + "num_tokens": 390525887.0, + "step": 15427 + }, + { + "epoch": 1.694267515923567, + "grad_norm": 2.418700695037842, + "learning_rate": 1e-06, + "loss": 0.7938, + "mean_token_accuracy": 0.7450371980667114, + "num_tokens": 390549095.0, + "step": 15428 + }, + { + "epoch": 1.6943773336261807, + "grad_norm": 2.4842240810394287, + "learning_rate": 1e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.7292791604995728, + "num_tokens": 390571553.0, + "step": 15429 + }, + { + "epoch": 1.6944871513287942, + "grad_norm": 2.1873302459716797, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7169701457023621, + "num_tokens": 390598203.0, + "step": 15430 + }, + { + "epoch": 1.6945969690314078, + "grad_norm": 2.2098569869995117, + "learning_rate": 1e-06, + "loss": 0.7793, + "mean_token_accuracy": 0.7500821352005005, + "num_tokens": 390623596.0, + "step": 15431 + }, + { + "epoch": 1.6947067867340215, + "grad_norm": 2.174182415008545, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7237348556518555, + "num_tokens": 390651752.0, + "step": 15432 + }, + { + "epoch": 1.6948166044366353, + "grad_norm": 2.1211800575256348, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7212287783622742, + "num_tokens": 390678218.0, + "step": 15433 + }, + { 
+ "epoch": 1.6949264221392488, + "grad_norm": 2.2275609970092773, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7211617231369019, + "num_tokens": 390702910.0, + "step": 15434 + }, + { + "epoch": 1.6950362398418624, + "grad_norm": 2.064809799194336, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.6977541446685791, + "num_tokens": 390732822.0, + "step": 15435 + }, + { + "epoch": 1.6951460575444761, + "grad_norm": 2.3942930698394775, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7109309434890747, + "num_tokens": 390755795.0, + "step": 15436 + }, + { + "epoch": 1.6952558752470899, + "grad_norm": 2.5523951053619385, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7150465250015259, + "num_tokens": 390776983.0, + "step": 15437 + }, + { + "epoch": 1.6953656929497036, + "grad_norm": 2.1166858673095703, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7060757875442505, + "num_tokens": 390801969.0, + "step": 15438 + }, + { + "epoch": 1.6954755106523172, + "grad_norm": 2.0540342330932617, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7066254615783691, + "num_tokens": 390833234.0, + "step": 15439 + }, + { + "epoch": 1.6955853283549307, + "grad_norm": 2.138361930847168, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7184821367263794, + "num_tokens": 390858802.0, + "step": 15440 + }, + { + "epoch": 1.6956951460575445, + "grad_norm": 2.2601704597473145, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7167160511016846, + "num_tokens": 390884687.0, + "step": 15441 + }, + { + "epoch": 1.6958049637601582, + "grad_norm": 2.661911964416504, + "learning_rate": 1e-06, + "loss": 0.8416, + "mean_token_accuracy": 0.7379089593887329, + "num_tokens": 390904996.0, + "step": 15442 + }, + { + "epoch": 1.6959147814627717, + "grad_norm": 2.266831636428833, + "learning_rate": 1e-06, + "loss": 0.896, + 
"mean_token_accuracy": 0.7252751588821411, + "num_tokens": 390929301.0, + "step": 15443 + }, + { + "epoch": 1.6960245991653855, + "grad_norm": 2.4229283332824707, + "learning_rate": 1e-06, + "loss": 0.8757, + "mean_token_accuracy": 0.7190521359443665, + "num_tokens": 390952232.0, + "step": 15444 + }, + { + "epoch": 1.696134416867999, + "grad_norm": 2.5653364658355713, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7219526171684265, + "num_tokens": 390973194.0, + "step": 15445 + }, + { + "epoch": 1.6962442345706128, + "grad_norm": 2.2471468448638916, + "learning_rate": 1e-06, + "loss": 0.7553, + "mean_token_accuracy": 0.7588136196136475, + "num_tokens": 390997523.0, + "step": 15446 + }, + { + "epoch": 1.6963540522732266, + "grad_norm": 2.0986948013305664, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7215925455093384, + "num_tokens": 391023516.0, + "step": 15447 + }, + { + "epoch": 1.69646386997584, + "grad_norm": 2.264078140258789, + "learning_rate": 1e-06, + "loss": 0.8177, + "mean_token_accuracy": 0.7385005950927734, + "num_tokens": 391046899.0, + "step": 15448 + }, + { + "epoch": 1.6965736876784536, + "grad_norm": 2.2723183631896973, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7168992757797241, + "num_tokens": 391071707.0, + "step": 15449 + }, + { + "epoch": 1.6966835053810674, + "grad_norm": 2.0385093688964844, + "learning_rate": 1e-06, + "loss": 0.842, + "mean_token_accuracy": 0.735512375831604, + "num_tokens": 391101467.0, + "step": 15450 + }, + { + "epoch": 1.6967933230836811, + "grad_norm": 2.1404988765716553, + "learning_rate": 1e-06, + "loss": 0.8727, + "mean_token_accuracy": 0.7293854355812073, + "num_tokens": 391127150.0, + "step": 15451 + }, + { + "epoch": 1.696903140786295, + "grad_norm": 2.480377435684204, + "learning_rate": 1e-06, + "loss": 0.7755, + "mean_token_accuracy": 0.7544177770614624, + "num_tokens": 391147546.0, + "step": 15452 + }, + { + "epoch": 
1.6970129584889084, + "grad_norm": 1.9810056686401367, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7042363882064819, + "num_tokens": 391178440.0, + "step": 15453 + }, + { + "epoch": 1.697122776191522, + "grad_norm": 2.2995705604553223, + "learning_rate": 1e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.731990396976471, + "num_tokens": 391203656.0, + "step": 15454 + }, + { + "epoch": 1.6972325938941357, + "grad_norm": 2.2837469577789307, + "learning_rate": 1e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.726309061050415, + "num_tokens": 391230137.0, + "step": 15455 + }, + { + "epoch": 1.6973424115967495, + "grad_norm": 2.4445841312408447, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7210022807121277, + "num_tokens": 391252554.0, + "step": 15456 + }, + { + "epoch": 1.697452229299363, + "grad_norm": 2.575758695602417, + "learning_rate": 1e-06, + "loss": 0.8388, + "mean_token_accuracy": 0.7396887540817261, + "num_tokens": 391272513.0, + "step": 15457 + }, + { + "epoch": 1.6975620470019768, + "grad_norm": 2.0770273208618164, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7085592150688171, + "num_tokens": 391301231.0, + "step": 15458 + }, + { + "epoch": 1.6976718647045903, + "grad_norm": 2.417590856552124, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.704595685005188, + "num_tokens": 391323853.0, + "step": 15459 + }, + { + "epoch": 1.697781682407204, + "grad_norm": 2.4412930011749268, + "learning_rate": 1e-06, + "loss": 0.8175, + "mean_token_accuracy": 0.737842321395874, + "num_tokens": 391345853.0, + "step": 15460 + }, + { + "epoch": 1.6978915001098178, + "grad_norm": 2.472776174545288, + "learning_rate": 1e-06, + "loss": 0.8621, + "mean_token_accuracy": 0.7301512956619263, + "num_tokens": 391368491.0, + "step": 15461 + }, + { + "epoch": 1.6980013178124314, + "grad_norm": 2.236570119857788, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 
0.7140514850616455, + "num_tokens": 391393790.0, + "step": 15462 + }, + { + "epoch": 1.698111135515045, + "grad_norm": 2.55688738822937, + "learning_rate": 1e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7238444685935974, + "num_tokens": 391416690.0, + "step": 15463 + }, + { + "epoch": 1.6982209532176586, + "grad_norm": 1.9444692134857178, + "learning_rate": 1e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.7331186532974243, + "num_tokens": 391445713.0, + "step": 15464 + }, + { + "epoch": 1.6983307709202724, + "grad_norm": 2.329359531402588, + "learning_rate": 1e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.7209181189537048, + "num_tokens": 391468162.0, + "step": 15465 + }, + { + "epoch": 1.6984405886228862, + "grad_norm": 2.252384901046753, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.713098406791687, + "num_tokens": 391493181.0, + "step": 15466 + }, + { + "epoch": 1.6985504063254997, + "grad_norm": 2.2963199615478516, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7095164060592651, + "num_tokens": 391518179.0, + "step": 15467 + }, + { + "epoch": 1.6986602240281132, + "grad_norm": 2.254988670349121, + "learning_rate": 1e-06, + "loss": 0.8697, + "mean_token_accuracy": 0.7258890867233276, + "num_tokens": 391544945.0, + "step": 15468 + }, + { + "epoch": 1.698770041730727, + "grad_norm": 1.894162654876709, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7199310660362244, + "num_tokens": 391578688.0, + "step": 15469 + }, + { + "epoch": 1.6988798594333407, + "grad_norm": 1.946428656578064, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7071355581283569, + "num_tokens": 391609017.0, + "step": 15470 + }, + { + "epoch": 1.6989896771359543, + "grad_norm": 2.3948261737823486, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7112870812416077, + "num_tokens": 391632161.0, + "step": 15471 + }, + { + "epoch": 1.6990994948385678, + "grad_norm": 
2.0152156352996826, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7070695757865906, + "num_tokens": 391661712.0, + "step": 15472 + }, + { + "epoch": 1.6992093125411816, + "grad_norm": 2.12593150138855, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.7026223540306091, + "num_tokens": 391688371.0, + "step": 15473 + }, + { + "epoch": 1.6993191302437953, + "grad_norm": 2.7673425674438477, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7140623331069946, + "num_tokens": 391706294.0, + "step": 15474 + }, + { + "epoch": 1.699428947946409, + "grad_norm": 2.434856653213501, + "learning_rate": 1e-06, + "loss": 0.8083, + "mean_token_accuracy": 0.7471858859062195, + "num_tokens": 391727605.0, + "step": 15475 + }, + { + "epoch": 1.6995387656490226, + "grad_norm": 2.1532347202301025, + "learning_rate": 1e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.7336007356643677, + "num_tokens": 391752791.0, + "step": 15476 + }, + { + "epoch": 1.6996485833516362, + "grad_norm": 2.362553358078003, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7062580585479736, + "num_tokens": 391777383.0, + "step": 15477 + }, + { + "epoch": 1.69975840105425, + "grad_norm": 2.359128713607788, + "learning_rate": 1e-06, + "loss": 0.8654, + "mean_token_accuracy": 0.7214187383651733, + "num_tokens": 391798776.0, + "step": 15478 + }, + { + "epoch": 1.6998682187568637, + "grad_norm": 2.514347791671753, + "learning_rate": 1e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.7379608154296875, + "num_tokens": 391819011.0, + "step": 15479 + }, + { + "epoch": 1.6999780364594774, + "grad_norm": 2.4272658824920654, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7189466953277588, + "num_tokens": 391842151.0, + "step": 15480 + }, + { + "epoch": 1.700087854162091, + "grad_norm": 2.6150009632110596, + "learning_rate": 1e-06, + "loss": 0.7898, + "mean_token_accuracy": 0.7482153177261353, + "num_tokens": 
391861116.0, + "step": 15481 + }, + { + "epoch": 1.7001976718647045, + "grad_norm": 2.183166980743408, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7152439951896667, + "num_tokens": 391887834.0, + "step": 15482 + }, + { + "epoch": 1.7003074895673183, + "grad_norm": 2.4326372146606445, + "learning_rate": 1e-06, + "loss": 0.8082, + "mean_token_accuracy": 0.7438664436340332, + "num_tokens": 391908765.0, + "step": 15483 + }, + { + "epoch": 1.700417307269932, + "grad_norm": 2.2321507930755615, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.70860755443573, + "num_tokens": 391934582.0, + "step": 15484 + }, + { + "epoch": 1.7005271249725455, + "grad_norm": 1.938101053237915, + "learning_rate": 1e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7193160057067871, + "num_tokens": 391965054.0, + "step": 15485 + }, + { + "epoch": 1.700636942675159, + "grad_norm": 1.9225353002548218, + "learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.7045179009437561, + "num_tokens": 391998572.0, + "step": 15486 + }, + { + "epoch": 1.7007467603777728, + "grad_norm": 2.235550880432129, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7243819236755371, + "num_tokens": 392023793.0, + "step": 15487 + }, + { + "epoch": 1.7008565780803866, + "grad_norm": 2.42671275138855, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7262696623802185, + "num_tokens": 392046527.0, + "step": 15488 + }, + { + "epoch": 1.7009663957830004, + "grad_norm": 2.3488423824310303, + "learning_rate": 1e-06, + "loss": 0.87, + "mean_token_accuracy": 0.731303334236145, + "num_tokens": 392069960.0, + "step": 15489 + }, + { + "epoch": 1.7010762134856139, + "grad_norm": 2.4386026859283447, + "learning_rate": 1e-06, + "loss": 0.8829, + "mean_token_accuracy": 0.7312073111534119, + "num_tokens": 392093081.0, + "step": 15490 + }, + { + "epoch": 1.7011860311882274, + "grad_norm": 2.0345606803894043, + "learning_rate": 1e-06, + 
"loss": 0.9986, + "mean_token_accuracy": 0.6943842768669128, + "num_tokens": 392126697.0, + "step": 15491 + }, + { + "epoch": 1.7012958488908412, + "grad_norm": 2.1522440910339355, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7273035049438477, + "num_tokens": 392150971.0, + "step": 15492 + }, + { + "epoch": 1.701405666593455, + "grad_norm": 2.161151647567749, + "learning_rate": 1e-06, + "loss": 0.8489, + "mean_token_accuracy": 0.7352465391159058, + "num_tokens": 392175490.0, + "step": 15493 + }, + { + "epoch": 1.7015154842960687, + "grad_norm": 2.275700092315674, + "learning_rate": 1e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7288792133331299, + "num_tokens": 392201817.0, + "step": 15494 + }, + { + "epoch": 1.7016253019986822, + "grad_norm": 2.217656135559082, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7105851769447327, + "num_tokens": 392229331.0, + "step": 15495 + }, + { + "epoch": 1.7017351197012958, + "grad_norm": 2.313814401626587, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7134054899215698, + "num_tokens": 392254062.0, + "step": 15496 + }, + { + "epoch": 1.7018449374039095, + "grad_norm": 2.380178213119507, + "learning_rate": 1e-06, + "loss": 0.8342, + "mean_token_accuracy": 0.7403166890144348, + "num_tokens": 392275440.0, + "step": 15497 + }, + { + "epoch": 1.7019547551065233, + "grad_norm": 2.174487352371216, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7179855704307556, + "num_tokens": 392301207.0, + "step": 15498 + }, + { + "epoch": 1.7020645728091368, + "grad_norm": 2.281154155731201, + "learning_rate": 1e-06, + "loss": 0.818, + "mean_token_accuracy": 0.7484339475631714, + "num_tokens": 392325002.0, + "step": 15499 + }, + { + "epoch": 1.7021743905117503, + "grad_norm": 2.082353115081787, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.6982213258743286, + "num_tokens": 392352336.0, + "step": 15500 + }, + { + "epoch": 
1.702284208214364, + "grad_norm": 2.2228569984436035, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.71580970287323, + "num_tokens": 392377341.0, + "step": 15501 + }, + { + "epoch": 1.7023940259169779, + "grad_norm": 2.3104188442230225, + "learning_rate": 1e-06, + "loss": 0.8276, + "mean_token_accuracy": 0.7440770864486694, + "num_tokens": 392400892.0, + "step": 15502 + }, + { + "epoch": 1.7025038436195916, + "grad_norm": 2.020742177963257, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.6882293820381165, + "num_tokens": 392432266.0, + "step": 15503 + }, + { + "epoch": 1.7026136613222052, + "grad_norm": 2.360928773880005, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7135756015777588, + "num_tokens": 392457577.0, + "step": 15504 + }, + { + "epoch": 1.7027234790248187, + "grad_norm": 2.7056453227996826, + "learning_rate": 1e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.7294671535491943, + "num_tokens": 392476977.0, + "step": 15505 + }, + { + "epoch": 1.7028332967274324, + "grad_norm": 2.6315135955810547, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7285637855529785, + "num_tokens": 392498039.0, + "step": 15506 + }, + { + "epoch": 1.7029431144300462, + "grad_norm": 2.055500030517578, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7150037884712219, + "num_tokens": 392527142.0, + "step": 15507 + }, + { + "epoch": 1.7030529321326597, + "grad_norm": 2.2762093544006348, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7159830331802368, + "num_tokens": 392553716.0, + "step": 15508 + }, + { + "epoch": 1.7031627498352735, + "grad_norm": 2.4094133377075195, + "learning_rate": 1e-06, + "loss": 0.8094, + "mean_token_accuracy": 0.7445390224456787, + "num_tokens": 392576281.0, + "step": 15509 + }, + { + "epoch": 1.703272567537887, + "grad_norm": 2.2651214599609375, + "learning_rate": 1e-06, + "loss": 0.8315, + "mean_token_accuracy": 
0.7391365170478821, + "num_tokens": 392599507.0, + "step": 15510 + }, + { + "epoch": 1.7033823852405008, + "grad_norm": 1.8622344732284546, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7158094048500061, + "num_tokens": 392634963.0, + "step": 15511 + }, + { + "epoch": 1.7034922029431145, + "grad_norm": 2.4814534187316895, + "learning_rate": 1e-06, + "loss": 0.848, + "mean_token_accuracy": 0.7342138290405273, + "num_tokens": 392657453.0, + "step": 15512 + }, + { + "epoch": 1.703602020645728, + "grad_norm": 2.7783970832824707, + "learning_rate": 1e-06, + "loss": 0.8427, + "mean_token_accuracy": 0.7421954870223999, + "num_tokens": 392674234.0, + "step": 15513 + }, + { + "epoch": 1.7037118383483416, + "grad_norm": 2.386770009994507, + "learning_rate": 1e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.7212917804718018, + "num_tokens": 392699242.0, + "step": 15514 + }, + { + "epoch": 1.7038216560509554, + "grad_norm": 2.2639801502227783, + "learning_rate": 1e-06, + "loss": 0.8467, + "mean_token_accuracy": 0.7348716855049133, + "num_tokens": 392722945.0, + "step": 15515 + }, + { + "epoch": 1.7039314737535691, + "grad_norm": 2.2257697582244873, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7059674263000488, + "num_tokens": 392748445.0, + "step": 15516 + }, + { + "epoch": 1.7040412914561829, + "grad_norm": 2.33616304397583, + "learning_rate": 1e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.7212462425231934, + "num_tokens": 392770136.0, + "step": 15517 + }, + { + "epoch": 1.7041511091587964, + "grad_norm": 2.317945718765259, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7153413891792297, + "num_tokens": 392794928.0, + "step": 15518 + }, + { + "epoch": 1.70426092686141, + "grad_norm": 2.5946645736694336, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.7353214025497437, + "num_tokens": 392813976.0, + "step": 15519 + }, + { + "epoch": 1.7043707445640237, + "grad_norm": 
2.259796619415283, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.7133095264434814, + "num_tokens": 392840224.0, + "step": 15520 + }, + { + "epoch": 1.7044805622666375, + "grad_norm": 2.303820848464966, + "learning_rate": 1e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7181140184402466, + "num_tokens": 392864013.0, + "step": 15521 + }, + { + "epoch": 1.704590379969251, + "grad_norm": 2.184166669845581, + "learning_rate": 1e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7246394157409668, + "num_tokens": 392889042.0, + "step": 15522 + }, + { + "epoch": 1.7047001976718648, + "grad_norm": 2.1464622020721436, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7100321650505066, + "num_tokens": 392918392.0, + "step": 15523 + }, + { + "epoch": 1.7048100153744783, + "grad_norm": 2.397263765335083, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7124751806259155, + "num_tokens": 392943127.0, + "step": 15524 + }, + { + "epoch": 1.704919833077092, + "grad_norm": 1.9958829879760742, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7183959484100342, + "num_tokens": 392975556.0, + "step": 15525 + }, + { + "epoch": 1.7050296507797058, + "grad_norm": 2.483727216720581, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.7250989675521851, + "num_tokens": 392996303.0, + "step": 15526 + }, + { + "epoch": 1.7051394684823193, + "grad_norm": 2.2900984287261963, + "learning_rate": 1e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7215223908424377, + "num_tokens": 393021428.0, + "step": 15527 + }, + { + "epoch": 1.7052492861849329, + "grad_norm": 2.2625021934509277, + "learning_rate": 1e-06, + "loss": 0.8396, + "mean_token_accuracy": 0.7386461496353149, + "num_tokens": 393044446.0, + "step": 15528 + }, + { + "epoch": 1.7053591038875466, + "grad_norm": 2.3966782093048096, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7126351594924927, + "num_tokens": 
393065834.0, + "step": 15529 + }, + { + "epoch": 1.7054689215901604, + "grad_norm": 2.2220332622528076, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7177870869636536, + "num_tokens": 393092225.0, + "step": 15530 + }, + { + "epoch": 1.7055787392927741, + "grad_norm": 2.365633010864258, + "learning_rate": 1e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7266335487365723, + "num_tokens": 393115396.0, + "step": 15531 + }, + { + "epoch": 1.7056885569953877, + "grad_norm": 1.9885413646697998, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7089507579803467, + "num_tokens": 393148831.0, + "step": 15532 + }, + { + "epoch": 1.7057983746980012, + "grad_norm": 2.2735517024993896, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7188660502433777, + "num_tokens": 393173856.0, + "step": 15533 + }, + { + "epoch": 1.705908192400615, + "grad_norm": 2.2129502296447754, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7138952612876892, + "num_tokens": 393200367.0, + "step": 15534 + }, + { + "epoch": 1.7060180101032287, + "grad_norm": 2.378229856491089, + "learning_rate": 1e-06, + "loss": 0.8522, + "mean_token_accuracy": 0.7322417497634888, + "num_tokens": 393225000.0, + "step": 15535 + }, + { + "epoch": 1.7061278278058423, + "grad_norm": 2.084618330001831, + "learning_rate": 1e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.6920901536941528, + "num_tokens": 393256972.0, + "step": 15536 + }, + { + "epoch": 1.7062376455084558, + "grad_norm": 2.548717975616455, + "learning_rate": 1e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7312408685684204, + "num_tokens": 393276849.0, + "step": 15537 + }, + { + "epoch": 1.7063474632110696, + "grad_norm": 2.3751983642578125, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7235338687896729, + "num_tokens": 393300640.0, + "step": 15538 + }, + { + "epoch": 1.7064572809136833, + "grad_norm": 2.3047637939453125, + "learning_rate": 
1e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.7206963300704956, + "num_tokens": 393324568.0, + "step": 15539 + }, + { + "epoch": 1.706567098616297, + "grad_norm": 2.6026384830474854, + "learning_rate": 1e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7206765413284302, + "num_tokens": 393344742.0, + "step": 15540 + }, + { + "epoch": 1.7066769163189106, + "grad_norm": 2.033942699432373, + "learning_rate": 1e-06, + "loss": 0.8345, + "mean_token_accuracy": 0.7345085144042969, + "num_tokens": 393376353.0, + "step": 15541 + }, + { + "epoch": 1.7067867340215241, + "grad_norm": 2.108292579650879, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7222427725791931, + "num_tokens": 393406285.0, + "step": 15542 + }, + { + "epoch": 1.706896551724138, + "grad_norm": 2.4958043098449707, + "learning_rate": 1e-06, + "loss": 0.8505, + "mean_token_accuracy": 0.72857666015625, + "num_tokens": 393427067.0, + "step": 15543 + }, + { + "epoch": 1.7070063694267517, + "grad_norm": 2.346421003341675, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7140992879867554, + "num_tokens": 393451978.0, + "step": 15544 + }, + { + "epoch": 1.7071161871293654, + "grad_norm": 2.490253210067749, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7212532758712769, + "num_tokens": 393474462.0, + "step": 15545 + }, + { + "epoch": 1.707226004831979, + "grad_norm": 2.381716251373291, + "learning_rate": 1e-06, + "loss": 0.8664, + "mean_token_accuracy": 0.7296547889709473, + "num_tokens": 393496314.0, + "step": 15546 + }, + { + "epoch": 1.7073358225345925, + "grad_norm": 2.028353452682495, + "learning_rate": 1e-06, + "loss": 0.8123, + "mean_token_accuracy": 0.7419167757034302, + "num_tokens": 393522166.0, + "step": 15547 + }, + { + "epoch": 1.7074456402372062, + "grad_norm": 2.2980804443359375, + "learning_rate": 1e-06, + "loss": 0.8404, + "mean_token_accuracy": 0.7290061116218567, + "num_tokens": 393546414.0, + "step": 15548 + }, + { + 
"epoch": 1.70755545793982, + "grad_norm": 2.1868317127227783, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.702048122882843, + "num_tokens": 393574214.0, + "step": 15549 + }, + { + "epoch": 1.7076652756424335, + "grad_norm": 2.437213897705078, + "learning_rate": 1e-06, + "loss": 0.841, + "mean_token_accuracy": 0.7397661209106445, + "num_tokens": 393594371.0, + "step": 15550 + }, + { + "epoch": 1.707775093345047, + "grad_norm": 2.709725856781006, + "learning_rate": 1e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7314275503158569, + "num_tokens": 393612941.0, + "step": 15551 + }, + { + "epoch": 1.7078849110476608, + "grad_norm": 2.1337690353393555, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7177761793136597, + "num_tokens": 393638662.0, + "step": 15552 + }, + { + "epoch": 1.7079947287502746, + "grad_norm": 2.1812093257904053, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7118997573852539, + "num_tokens": 393665409.0, + "step": 15553 + }, + { + "epoch": 1.7081045464528883, + "grad_norm": 2.4769463539123535, + "learning_rate": 1e-06, + "loss": 0.8655, + "mean_token_accuracy": 0.734775185585022, + "num_tokens": 393687516.0, + "step": 15554 + }, + { + "epoch": 1.7082143641555019, + "grad_norm": 2.394789218902588, + "learning_rate": 1e-06, + "loss": 0.8243, + "mean_token_accuracy": 0.7379686236381531, + "num_tokens": 393711802.0, + "step": 15555 + }, + { + "epoch": 1.7083241818581154, + "grad_norm": 2.4234731197357178, + "learning_rate": 1e-06, + "loss": 0.8644, + "mean_token_accuracy": 0.7273145318031311, + "num_tokens": 393733232.0, + "step": 15556 + }, + { + "epoch": 1.7084339995607292, + "grad_norm": 2.578221559524536, + "learning_rate": 1e-06, + "loss": 0.8, + "mean_token_accuracy": 0.7355585098266602, + "num_tokens": 393752058.0, + "step": 15557 + }, + { + "epoch": 1.708543817263343, + "grad_norm": 2.2672722339630127, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 
0.7031064033508301, + "num_tokens": 393776982.0, + "step": 15558 + }, + { + "epoch": 1.7086536349659565, + "grad_norm": 2.251128911972046, + "learning_rate": 1e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7180576324462891, + "num_tokens": 393803541.0, + "step": 15559 + }, + { + "epoch": 1.7087634526685702, + "grad_norm": 2.3081295490264893, + "learning_rate": 1e-06, + "loss": 0.8682, + "mean_token_accuracy": 0.7349334955215454, + "num_tokens": 393827483.0, + "step": 15560 + }, + { + "epoch": 1.7088732703711838, + "grad_norm": 2.2988295555114746, + "learning_rate": 1e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.72188401222229, + "num_tokens": 393851915.0, + "step": 15561 + }, + { + "epoch": 1.7089830880737975, + "grad_norm": 2.4220805168151855, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7185639142990112, + "num_tokens": 393875144.0, + "step": 15562 + }, + { + "epoch": 1.7090929057764113, + "grad_norm": 2.16062331199646, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7147263288497925, + "num_tokens": 393900533.0, + "step": 15563 + }, + { + "epoch": 1.7092027234790248, + "grad_norm": 1.9649449586868286, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7129716277122498, + "num_tokens": 393930201.0, + "step": 15564 + }, + { + "epoch": 1.7093125411816383, + "grad_norm": 2.185004711151123, + "learning_rate": 1e-06, + "loss": 0.8174, + "mean_token_accuracy": 0.7401169538497925, + "num_tokens": 393955277.0, + "step": 15565 + }, + { + "epoch": 1.709422358884252, + "grad_norm": 2.3466572761535645, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7295200824737549, + "num_tokens": 393979514.0, + "step": 15566 + }, + { + "epoch": 1.7095321765868658, + "grad_norm": 1.954207181930542, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7122853994369507, + "num_tokens": 394011881.0, + "step": 15567 + }, + { + "epoch": 1.7096419942894796, + "grad_norm": 
2.3303496837615967, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.6977910995483398, + "num_tokens": 394037832.0, + "step": 15568 + }, + { + "epoch": 1.7097518119920931, + "grad_norm": 2.1767332553863525, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.6873614192008972, + "num_tokens": 394064262.0, + "step": 15569 + }, + { + "epoch": 1.7098616296947067, + "grad_norm": 2.5394208431243896, + "learning_rate": 1e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.730871856212616, + "num_tokens": 394086535.0, + "step": 15570 + }, + { + "epoch": 1.7099714473973204, + "grad_norm": 2.755965232849121, + "learning_rate": 1e-06, + "loss": 0.823, + "mean_token_accuracy": 0.7423396706581116, + "num_tokens": 394104126.0, + "step": 15571 + }, + { + "epoch": 1.7100812650999342, + "grad_norm": 2.121452808380127, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7051800489425659, + "num_tokens": 394132331.0, + "step": 15572 + }, + { + "epoch": 1.7101910828025477, + "grad_norm": 2.5743422508239746, + "learning_rate": 1e-06, + "loss": 0.8084, + "mean_token_accuracy": 0.7457996010780334, + "num_tokens": 394151505.0, + "step": 15573 + }, + { + "epoch": 1.7103009005051615, + "grad_norm": 2.303054094314575, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7157939672470093, + "num_tokens": 394176962.0, + "step": 15574 + }, + { + "epoch": 1.710410718207775, + "grad_norm": 2.7755918502807617, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7218501567840576, + "num_tokens": 394195536.0, + "step": 15575 + }, + { + "epoch": 1.7105205359103888, + "grad_norm": 2.524480104446411, + "learning_rate": 1e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.7392534017562866, + "num_tokens": 394217665.0, + "step": 15576 + }, + { + "epoch": 1.7106303536130025, + "grad_norm": 2.1902060508728027, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7259624004364014, + "num_tokens": 
394245021.0, + "step": 15577 + }, + { + "epoch": 1.710740171315616, + "grad_norm": 2.080183506011963, + "learning_rate": 1e-06, + "loss": 0.8832, + "mean_token_accuracy": 0.7243104577064514, + "num_tokens": 394273748.0, + "step": 15578 + }, + { + "epoch": 1.7108499890182296, + "grad_norm": 2.1114847660064697, + "learning_rate": 1e-06, + "loss": 0.8595, + "mean_token_accuracy": 0.7367058992385864, + "num_tokens": 394300470.0, + "step": 15579 + }, + { + "epoch": 1.7109598067208434, + "grad_norm": 2.2695348262786865, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7220131754875183, + "num_tokens": 394326201.0, + "step": 15580 + }, + { + "epoch": 1.7110696244234571, + "grad_norm": 2.5447769165039062, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.730749249458313, + "num_tokens": 394348384.0, + "step": 15581 + }, + { + "epoch": 1.7111794421260709, + "grad_norm": 2.445502758026123, + "learning_rate": 1e-06, + "loss": 0.8606, + "mean_token_accuracy": 0.7317326664924622, + "num_tokens": 394370340.0, + "step": 15582 + }, + { + "epoch": 1.7112892598286844, + "grad_norm": 2.210766315460205, + "learning_rate": 1e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.7298939228057861, + "num_tokens": 394395889.0, + "step": 15583 + }, + { + "epoch": 1.711399077531298, + "grad_norm": 2.228076934814453, + "learning_rate": 1e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7224030494689941, + "num_tokens": 394421663.0, + "step": 15584 + }, + { + "epoch": 1.7115088952339117, + "grad_norm": 2.3346312046051025, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7098637819290161, + "num_tokens": 394448433.0, + "step": 15585 + }, + { + "epoch": 1.7116187129365255, + "grad_norm": 2.005171298980713, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7171469926834106, + "num_tokens": 394478004.0, + "step": 15586 + }, + { + "epoch": 1.711728530639139, + "grad_norm": 2.1265618801116943, + "learning_rate": 
1e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7366330623626709, + "num_tokens": 394504282.0, + "step": 15587 + }, + { + "epoch": 1.7118383483417525, + "grad_norm": 2.064268112182617, + "learning_rate": 1e-06, + "loss": 0.7765, + "mean_token_accuracy": 0.7514630556106567, + "num_tokens": 394530374.0, + "step": 15588 + }, + { + "epoch": 1.7119481660443663, + "grad_norm": 2.379849433898926, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.6881251335144043, + "num_tokens": 394556311.0, + "step": 15589 + }, + { + "epoch": 1.71205798374698, + "grad_norm": 2.0792529582977295, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7207721471786499, + "num_tokens": 394587603.0, + "step": 15590 + }, + { + "epoch": 1.7121678014495938, + "grad_norm": 2.268517255783081, + "learning_rate": 1e-06, + "loss": 0.765, + "mean_token_accuracy": 0.7531070709228516, + "num_tokens": 394612032.0, + "step": 15591 + }, + { + "epoch": 1.7122776191522073, + "grad_norm": 2.1083436012268066, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7180120944976807, + "num_tokens": 394638697.0, + "step": 15592 + }, + { + "epoch": 1.7123874368548209, + "grad_norm": 2.4384846687316895, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7133569717407227, + "num_tokens": 394661744.0, + "step": 15593 + }, + { + "epoch": 1.7124972545574346, + "grad_norm": 2.279059648513794, + "learning_rate": 1e-06, + "loss": 0.839, + "mean_token_accuracy": 0.7371299266815186, + "num_tokens": 394685046.0, + "step": 15594 + }, + { + "epoch": 1.7126070722600484, + "grad_norm": 2.5866169929504395, + "learning_rate": 1e-06, + "loss": 0.8354, + "mean_token_accuracy": 0.7345849275588989, + "num_tokens": 394706004.0, + "step": 15595 + }, + { + "epoch": 1.7127168899626621, + "grad_norm": 2.060783624649048, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.713447630405426, + "num_tokens": 394734315.0, + "step": 15596 + }, + { + 
"epoch": 1.7128267076652757, + "grad_norm": 2.0375993251800537, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7267639636993408, + "num_tokens": 394764076.0, + "step": 15597 + }, + { + "epoch": 1.7129365253678892, + "grad_norm": 2.1933984756469727, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7085498571395874, + "num_tokens": 394793331.0, + "step": 15598 + }, + { + "epoch": 1.713046343070503, + "grad_norm": 2.372784376144409, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7019587159156799, + "num_tokens": 394817223.0, + "step": 15599 + }, + { + "epoch": 1.7131561607731167, + "grad_norm": 2.1590287685394287, + "learning_rate": 1e-06, + "loss": 0.8483, + "mean_token_accuracy": 0.7372705936431885, + "num_tokens": 394845208.0, + "step": 15600 + }, + { + "epoch": 1.7132659784757303, + "grad_norm": 2.484785795211792, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7072579860687256, + "num_tokens": 394868757.0, + "step": 15601 + }, + { + "epoch": 1.7133757961783438, + "grad_norm": 2.254533290863037, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7194632291793823, + "num_tokens": 394895001.0, + "step": 15602 + }, + { + "epoch": 1.7134856138809575, + "grad_norm": 2.21431040763855, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7168916463851929, + "num_tokens": 394921130.0, + "step": 15603 + }, + { + "epoch": 1.7135954315835713, + "grad_norm": 2.115562677383423, + "learning_rate": 1e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.72603440284729, + "num_tokens": 394952629.0, + "step": 15604 + }, + { + "epoch": 1.713705249286185, + "grad_norm": 1.8032314777374268, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.6988817453384399, + "num_tokens": 394991869.0, + "step": 15605 + }, + { + "epoch": 1.7138150669887986, + "grad_norm": 2.3053460121154785, + "learning_rate": 1e-06, + "loss": 0.931, + 
"mean_token_accuracy": 0.7106336355209351, + "num_tokens": 395017090.0, + "step": 15606 + }, + { + "epoch": 1.7139248846914121, + "grad_norm": 2.1423394680023193, + "learning_rate": 1e-06, + "loss": 0.861, + "mean_token_accuracy": 0.7257093787193298, + "num_tokens": 395043846.0, + "step": 15607 + }, + { + "epoch": 1.7140347023940259, + "grad_norm": 2.0921638011932373, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7282795906066895, + "num_tokens": 395069979.0, + "step": 15608 + }, + { + "epoch": 1.7141445200966396, + "grad_norm": 2.3085076808929443, + "learning_rate": 1e-06, + "loss": 1.022, + "mean_token_accuracy": 0.6870349645614624, + "num_tokens": 395096053.0, + "step": 15609 + }, + { + "epoch": 1.7142543377992534, + "grad_norm": 2.2536275386810303, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7108339071273804, + "num_tokens": 395122455.0, + "step": 15610 + }, + { + "epoch": 1.714364155501867, + "grad_norm": 2.4022927284240723, + "learning_rate": 1e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.7275711894035339, + "num_tokens": 395145454.0, + "step": 15611 + }, + { + "epoch": 1.7144739732044805, + "grad_norm": 2.5430006980895996, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.6923370957374573, + "num_tokens": 395167038.0, + "step": 15612 + }, + { + "epoch": 1.7145837909070942, + "grad_norm": 2.228717803955078, + "learning_rate": 1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.710990846157074, + "num_tokens": 395193759.0, + "step": 15613 + }, + { + "epoch": 1.714693608609708, + "grad_norm": 2.3225502967834473, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7124004364013672, + "num_tokens": 395220103.0, + "step": 15614 + }, + { + "epoch": 1.7148034263123215, + "grad_norm": 2.2343363761901855, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7153447866439819, + "num_tokens": 395247808.0, + "step": 15615 + }, + { + "epoch": 
1.714913244014935, + "grad_norm": 2.37444806098938, + "learning_rate": 1e-06, + "loss": 0.7914, + "mean_token_accuracy": 0.7433856129646301, + "num_tokens": 395269741.0, + "step": 15616 + }, + { + "epoch": 1.7150230617175488, + "grad_norm": 2.124746084213257, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7176733613014221, + "num_tokens": 395299309.0, + "step": 15617 + }, + { + "epoch": 1.7151328794201626, + "grad_norm": 2.0376484394073486, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.704456090927124, + "num_tokens": 395328954.0, + "step": 15618 + }, + { + "epoch": 1.7152426971227763, + "grad_norm": 2.487480640411377, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7389347553253174, + "num_tokens": 395352635.0, + "step": 15619 + }, + { + "epoch": 1.7153525148253899, + "grad_norm": 2.333002805709839, + "learning_rate": 1e-06, + "loss": 0.8152, + "mean_token_accuracy": 0.7370985150337219, + "num_tokens": 395377076.0, + "step": 15620 + }, + { + "epoch": 1.7154623325280034, + "grad_norm": 2.501403570175171, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.724178671836853, + "num_tokens": 395398085.0, + "step": 15621 + }, + { + "epoch": 1.7155721502306172, + "grad_norm": 1.9638433456420898, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.702958881855011, + "num_tokens": 395431791.0, + "step": 15622 + }, + { + "epoch": 1.715681967933231, + "grad_norm": 2.1380088329315186, + "learning_rate": 1e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7301392555236816, + "num_tokens": 395458229.0, + "step": 15623 + }, + { + "epoch": 1.7157917856358444, + "grad_norm": 2.0760974884033203, + "learning_rate": 1e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7265958189964294, + "num_tokens": 395487318.0, + "step": 15624 + }, + { + "epoch": 1.7159016033384582, + "grad_norm": 1.9812289476394653, + "learning_rate": 1e-06, + "loss": 0.877, + "mean_token_accuracy": 
0.7324827909469604, + "num_tokens": 395515337.0, + "step": 15625 + }, + { + "epoch": 1.7160114210410717, + "grad_norm": 2.3188397884368896, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7122440934181213, + "num_tokens": 395539834.0, + "step": 15626 + }, + { + "epoch": 1.7161212387436855, + "grad_norm": 2.6868255138397217, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7264828681945801, + "num_tokens": 395559791.0, + "step": 15627 + }, + { + "epoch": 1.7162310564462993, + "grad_norm": 2.506160259246826, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7212867736816406, + "num_tokens": 395582410.0, + "step": 15628 + }, + { + "epoch": 1.7163408741489128, + "grad_norm": 2.513061046600342, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7302753329277039, + "num_tokens": 395604806.0, + "step": 15629 + }, + { + "epoch": 1.7164506918515263, + "grad_norm": 2.2437868118286133, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7149021029472351, + "num_tokens": 395631740.0, + "step": 15630 + }, + { + "epoch": 1.71656050955414, + "grad_norm": 2.6681671142578125, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.732735812664032, + "num_tokens": 395652290.0, + "step": 15631 + }, + { + "epoch": 1.7166703272567538, + "grad_norm": 1.9822009801864624, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7171143293380737, + "num_tokens": 395680819.0, + "step": 15632 + }, + { + "epoch": 1.7167801449593676, + "grad_norm": 2.1049599647521973, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7050860524177551, + "num_tokens": 395709790.0, + "step": 15633 + }, + { + "epoch": 1.7168899626619811, + "grad_norm": 2.341850757598877, + "learning_rate": 1e-06, + "loss": 0.8162, + "mean_token_accuracy": 0.7360297441482544, + "num_tokens": 395732592.0, + "step": 15634 + }, + { + "epoch": 1.7169997803645947, + "grad_norm": 
2.2384555339813232, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7149598598480225, + "num_tokens": 395758049.0, + "step": 15635 + }, + { + "epoch": 1.7171095980672084, + "grad_norm": 2.111353635787964, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.700556755065918, + "num_tokens": 395785833.0, + "step": 15636 + }, + { + "epoch": 1.7172194157698222, + "grad_norm": 2.162010431289673, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7212809324264526, + "num_tokens": 395812934.0, + "step": 15637 + }, + { + "epoch": 1.7173292334724357, + "grad_norm": 2.617238759994507, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7359481453895569, + "num_tokens": 395831732.0, + "step": 15638 + }, + { + "epoch": 1.7174390511750495, + "grad_norm": 2.187730312347412, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7011555433273315, + "num_tokens": 395858439.0, + "step": 15639 + }, + { + "epoch": 1.717548868877663, + "grad_norm": 2.0173232555389404, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.6890883445739746, + "num_tokens": 395888329.0, + "step": 15640 + }, + { + "epoch": 1.7176586865802768, + "grad_norm": 2.393385171890259, + "learning_rate": 1e-06, + "loss": 0.872, + "mean_token_accuracy": 0.7282322645187378, + "num_tokens": 395912045.0, + "step": 15641 + }, + { + "epoch": 1.7177685042828905, + "grad_norm": 2.111124277114868, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7208833694458008, + "num_tokens": 395940408.0, + "step": 15642 + }, + { + "epoch": 1.717878321985504, + "grad_norm": 2.160769462585449, + "learning_rate": 1e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.7280902862548828, + "num_tokens": 395968063.0, + "step": 15643 + }, + { + "epoch": 1.7179881396881176, + "grad_norm": 2.4770877361297607, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7221246957778931, + "num_tokens": 
395989389.0, + "step": 15644 + }, + { + "epoch": 1.7180979573907313, + "grad_norm": 2.005896806716919, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.6995980739593506, + "num_tokens": 396020079.0, + "step": 15645 + }, + { + "epoch": 1.718207775093345, + "grad_norm": 1.9184991121292114, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7188575267791748, + "num_tokens": 396053691.0, + "step": 15646 + }, + { + "epoch": 1.7183175927959589, + "grad_norm": 2.654592990875244, + "learning_rate": 1e-06, + "loss": 0.8234, + "mean_token_accuracy": 0.7352582216262817, + "num_tokens": 396073404.0, + "step": 15647 + }, + { + "epoch": 1.7184274104985724, + "grad_norm": 2.3488142490386963, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7147011756896973, + "num_tokens": 396096617.0, + "step": 15648 + }, + { + "epoch": 1.718537228201186, + "grad_norm": 2.53997540473938, + "learning_rate": 1e-06, + "loss": 0.7806, + "mean_token_accuracy": 0.7511147260665894, + "num_tokens": 396116478.0, + "step": 15649 + }, + { + "epoch": 1.7186470459037997, + "grad_norm": 2.349276065826416, + "learning_rate": 1e-06, + "loss": 0.8311, + "mean_token_accuracy": 0.7544572353363037, + "num_tokens": 396139199.0, + "step": 15650 + }, + { + "epoch": 1.7187568636064134, + "grad_norm": 2.3450369834899902, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7115808725357056, + "num_tokens": 396164278.0, + "step": 15651 + }, + { + "epoch": 1.718866681309027, + "grad_norm": 2.3656811714172363, + "learning_rate": 1e-06, + "loss": 0.883, + "mean_token_accuracy": 0.71867835521698, + "num_tokens": 396186894.0, + "step": 15652 + }, + { + "epoch": 1.7189764990116405, + "grad_norm": 2.2628684043884277, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.706796407699585, + "num_tokens": 396211848.0, + "step": 15653 + }, + { + "epoch": 1.7190863167142543, + "grad_norm": 2.3852295875549316, + "learning_rate": 1e-06, 
+ "loss": 0.8624, + "mean_token_accuracy": 0.720885157585144, + "num_tokens": 396233688.0, + "step": 15654 + }, + { + "epoch": 1.719196134416868, + "grad_norm": 2.1268632411956787, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7189058661460876, + "num_tokens": 396259761.0, + "step": 15655 + }, + { + "epoch": 1.7193059521194818, + "grad_norm": 2.185800075531006, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7020053863525391, + "num_tokens": 396287727.0, + "step": 15656 + }, + { + "epoch": 1.7194157698220953, + "grad_norm": 2.3567821979522705, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.720633327960968, + "num_tokens": 396311762.0, + "step": 15657 + }, + { + "epoch": 1.7195255875247089, + "grad_norm": 2.2361013889312744, + "learning_rate": 1e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7202160954475403, + "num_tokens": 396338107.0, + "step": 15658 + }, + { + "epoch": 1.7196354052273226, + "grad_norm": 2.125593900680542, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7120857238769531, + "num_tokens": 396367198.0, + "step": 15659 + }, + { + "epoch": 1.7197452229299364, + "grad_norm": 2.10539174079895, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.705875039100647, + "num_tokens": 396395686.0, + "step": 15660 + }, + { + "epoch": 1.7198550406325501, + "grad_norm": 2.413296937942505, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.720937192440033, + "num_tokens": 396418788.0, + "step": 15661 + }, + { + "epoch": 1.7199648583351637, + "grad_norm": 2.6870546340942383, + "learning_rate": 1e-06, + "loss": 0.8639, + "mean_token_accuracy": 0.7339046001434326, + "num_tokens": 396437878.0, + "step": 15662 + }, + { + "epoch": 1.7200746760377772, + "grad_norm": 2.3454277515411377, + "learning_rate": 1e-06, + "loss": 0.8372, + "mean_token_accuracy": 0.7301986217498779, + "num_tokens": 396461306.0, + "step": 15663 + }, + { + "epoch": 
1.720184493740391, + "grad_norm": 2.100985527038574, + "learning_rate": 1e-06, + "loss": 1.0662, + "mean_token_accuracy": 0.6981490850448608, + "num_tokens": 396492095.0, + "step": 15664 + }, + { + "epoch": 1.7202943114430047, + "grad_norm": 2.3285653591156006, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7157150506973267, + "num_tokens": 396515419.0, + "step": 15665 + }, + { + "epoch": 1.7204041291456182, + "grad_norm": 2.296861171722412, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7046905755996704, + "num_tokens": 396540102.0, + "step": 15666 + }, + { + "epoch": 1.7205139468482318, + "grad_norm": 1.8897652626037598, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.6944000124931335, + "num_tokens": 396573916.0, + "step": 15667 + }, + { + "epoch": 1.7206237645508455, + "grad_norm": 2.6026999950408936, + "learning_rate": 1e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.7261831760406494, + "num_tokens": 396595021.0, + "step": 15668 + }, + { + "epoch": 1.7207335822534593, + "grad_norm": 1.9740084409713745, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.712314248085022, + "num_tokens": 396628699.0, + "step": 15669 + }, + { + "epoch": 1.720843399956073, + "grad_norm": 2.2724194526672363, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7243360280990601, + "num_tokens": 396653034.0, + "step": 15670 + }, + { + "epoch": 1.7209532176586866, + "grad_norm": 2.3596699237823486, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7155095338821411, + "num_tokens": 396676950.0, + "step": 15671 + }, + { + "epoch": 1.7210630353613001, + "grad_norm": 1.9845097064971924, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7109565138816833, + "num_tokens": 396709225.0, + "step": 15672 + }, + { + "epoch": 1.7211728530639139, + "grad_norm": 2.251646041870117, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 
0.7138208746910095, + "num_tokens": 396735512.0, + "step": 15673 + }, + { + "epoch": 1.7212826707665276, + "grad_norm": 2.2464656829833984, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7080141305923462, + "num_tokens": 396761892.0, + "step": 15674 + }, + { + "epoch": 1.7213924884691414, + "grad_norm": 2.1698877811431885, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.6991485357284546, + "num_tokens": 396791355.0, + "step": 15675 + }, + { + "epoch": 1.721502306171755, + "grad_norm": 2.1796324253082275, + "learning_rate": 1e-06, + "loss": 0.8647, + "mean_token_accuracy": 0.7309744358062744, + "num_tokens": 396819329.0, + "step": 15676 + }, + { + "epoch": 1.7216121238743685, + "grad_norm": 2.2312114238739014, + "learning_rate": 1e-06, + "loss": 0.7994, + "mean_token_accuracy": 0.7430939674377441, + "num_tokens": 396844571.0, + "step": 15677 + }, + { + "epoch": 1.7217219415769822, + "grad_norm": 2.4118144512176514, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7225459814071655, + "num_tokens": 396865264.0, + "step": 15678 + }, + { + "epoch": 1.721831759279596, + "grad_norm": 2.4532527923583984, + "learning_rate": 1e-06, + "loss": 0.8354, + "mean_token_accuracy": 0.7403472661972046, + "num_tokens": 396885987.0, + "step": 15679 + }, + { + "epoch": 1.7219415769822095, + "grad_norm": 2.1883957386016846, + "learning_rate": 1e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7227412462234497, + "num_tokens": 396912085.0, + "step": 15680 + }, + { + "epoch": 1.722051394684823, + "grad_norm": 2.0405702590942383, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.7066865563392639, + "num_tokens": 396944299.0, + "step": 15681 + }, + { + "epoch": 1.7221612123874368, + "grad_norm": 2.4608893394470215, + "learning_rate": 1e-06, + "loss": 0.7721, + "mean_token_accuracy": 0.7528197765350342, + "num_tokens": 396964313.0, + "step": 15682 + }, + { + "epoch": 1.7222710300900506, + "grad_norm": 
2.168292999267578, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7131402492523193, + "num_tokens": 396992985.0, + "step": 15683 + }, + { + "epoch": 1.7223808477926643, + "grad_norm": 2.1795153617858887, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7144843339920044, + "num_tokens": 397019840.0, + "step": 15684 + }, + { + "epoch": 1.7224906654952779, + "grad_norm": 2.382333517074585, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7137659788131714, + "num_tokens": 397042732.0, + "step": 15685 + }, + { + "epoch": 1.7226004831978914, + "grad_norm": 2.4454171657562256, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.6998332142829895, + "num_tokens": 397066285.0, + "step": 15686 + }, + { + "epoch": 1.7227103009005051, + "grad_norm": 2.5829567909240723, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7337327003479004, + "num_tokens": 397087058.0, + "step": 15687 + }, + { + "epoch": 1.722820118603119, + "grad_norm": 2.178394317626953, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7134556770324707, + "num_tokens": 397112832.0, + "step": 15688 + }, + { + "epoch": 1.7229299363057324, + "grad_norm": 2.1611487865448, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7166925668716431, + "num_tokens": 397140850.0, + "step": 15689 + }, + { + "epoch": 1.7230397540083462, + "grad_norm": 2.2489724159240723, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7176084518432617, + "num_tokens": 397167498.0, + "step": 15690 + }, + { + "epoch": 1.7231495717109597, + "grad_norm": 1.944057822227478, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7190003395080566, + "num_tokens": 397197379.0, + "step": 15691 + }, + { + "epoch": 1.7232593894135735, + "grad_norm": 2.575397491455078, + "learning_rate": 1e-06, + "loss": 0.8204, + "mean_token_accuracy": 0.7411789894104004, + "num_tokens": 
397218758.0, + "step": 15692 + }, + { + "epoch": 1.7233692071161872, + "grad_norm": 2.329493522644043, + "learning_rate": 1e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.734192967414856, + "num_tokens": 397242930.0, + "step": 15693 + }, + { + "epoch": 1.7234790248188008, + "grad_norm": 2.303884983062744, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7090795040130615, + "num_tokens": 397268381.0, + "step": 15694 + }, + { + "epoch": 1.7235888425214143, + "grad_norm": 1.9629532098770142, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.6979435086250305, + "num_tokens": 397300209.0, + "step": 15695 + }, + { + "epoch": 1.723698660224028, + "grad_norm": 2.3231968879699707, + "learning_rate": 1e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.7212103009223938, + "num_tokens": 397325736.0, + "step": 15696 + }, + { + "epoch": 1.7238084779266418, + "grad_norm": 2.2761638164520264, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7108675241470337, + "num_tokens": 397350757.0, + "step": 15697 + }, + { + "epoch": 1.7239182956292556, + "grad_norm": 2.301759719848633, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7236030101776123, + "num_tokens": 397376487.0, + "step": 15698 + }, + { + "epoch": 1.7240281133318691, + "grad_norm": 2.0929534435272217, + "learning_rate": 1e-06, + "loss": 0.8537, + "mean_token_accuracy": 0.7316830158233643, + "num_tokens": 397403261.0, + "step": 15699 + }, + { + "epoch": 1.7241379310344827, + "grad_norm": 2.168046712875366, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7174831032752991, + "num_tokens": 397429982.0, + "step": 15700 + }, + { + "epoch": 1.7242477487370964, + "grad_norm": 2.0638341903686523, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7122155427932739, + "num_tokens": 397459037.0, + "step": 15701 + }, + { + "epoch": 1.7243575664397102, + "grad_norm": 2.0153887271881104, + "learning_rate": 
1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7017167806625366, + "num_tokens": 397489448.0, + "step": 15702 + }, + { + "epoch": 1.7244673841423237, + "grad_norm": 2.6152095794677734, + "learning_rate": 1e-06, + "loss": 0.7868, + "mean_token_accuracy": 0.7429839372634888, + "num_tokens": 397508452.0, + "step": 15703 + }, + { + "epoch": 1.7245772018449375, + "grad_norm": 2.499659538269043, + "learning_rate": 1e-06, + "loss": 0.8064, + "mean_token_accuracy": 0.7418439984321594, + "num_tokens": 397529616.0, + "step": 15704 + }, + { + "epoch": 1.724687019547551, + "grad_norm": 2.514355421066284, + "learning_rate": 1e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7370731830596924, + "num_tokens": 397549722.0, + "step": 15705 + }, + { + "epoch": 1.7247968372501647, + "grad_norm": 2.3239433765411377, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.7025724649429321, + "num_tokens": 397575626.0, + "step": 15706 + }, + { + "epoch": 1.7249066549527785, + "grad_norm": 2.704146385192871, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7229795455932617, + "num_tokens": 397595134.0, + "step": 15707 + }, + { + "epoch": 1.725016472655392, + "grad_norm": 2.267395496368408, + "learning_rate": 1e-06, + "loss": 0.8051, + "mean_token_accuracy": 0.7449988722801208, + "num_tokens": 397619787.0, + "step": 15708 + }, + { + "epoch": 1.7251262903580056, + "grad_norm": 2.4089767932891846, + "learning_rate": 1e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.7363595366477966, + "num_tokens": 397642740.0, + "step": 15709 + }, + { + "epoch": 1.7252361080606193, + "grad_norm": 2.3995718955993652, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7099059224128723, + "num_tokens": 397664941.0, + "step": 15710 + }, + { + "epoch": 1.725345925763233, + "grad_norm": 2.2285287380218506, + "learning_rate": 1e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7262594699859619, + "num_tokens": 397690568.0, + "step": 15711 + }, + { + 
"epoch": 1.7254557434658468, + "grad_norm": 2.4685070514678955, + "learning_rate": 1e-06, + "loss": 0.7833, + "mean_token_accuracy": 0.7475090026855469, + "num_tokens": 397711373.0, + "step": 15712 + }, + { + "epoch": 1.7255655611684604, + "grad_norm": 2.6598527431488037, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7213007807731628, + "num_tokens": 397731227.0, + "step": 15713 + }, + { + "epoch": 1.725675378871074, + "grad_norm": 2.2202165126800537, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7098584771156311, + "num_tokens": 397757671.0, + "step": 15714 + }, + { + "epoch": 1.7257851965736877, + "grad_norm": 2.4798672199249268, + "learning_rate": 1e-06, + "loss": 0.8661, + "mean_token_accuracy": 0.7264372110366821, + "num_tokens": 397778684.0, + "step": 15715 + }, + { + "epoch": 1.7258950142763014, + "grad_norm": 2.319406747817993, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7160008549690247, + "num_tokens": 397802944.0, + "step": 15716 + }, + { + "epoch": 1.726004831978915, + "grad_norm": 2.6164350509643555, + "learning_rate": 1e-06, + "loss": 0.7696, + "mean_token_accuracy": 0.7563395500183105, + "num_tokens": 397822525.0, + "step": 15717 + }, + { + "epoch": 1.7261146496815285, + "grad_norm": 2.3530917167663574, + "learning_rate": 1e-06, + "loss": 0.7879, + "mean_token_accuracy": 0.748808741569519, + "num_tokens": 397844244.0, + "step": 15718 + }, + { + "epoch": 1.7262244673841423, + "grad_norm": 2.2295360565185547, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.722070574760437, + "num_tokens": 397868279.0, + "step": 15719 + }, + { + "epoch": 1.726334285086756, + "grad_norm": 2.173931121826172, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7101256847381592, + "num_tokens": 397894927.0, + "step": 15720 + }, + { + "epoch": 1.7264441027893698, + "grad_norm": 2.0869650840759277, + "learning_rate": 1e-06, + "loss": 0.9091, + 
"mean_token_accuracy": 0.7160020470619202, + "num_tokens": 397924186.0, + "step": 15721 + }, + { + "epoch": 1.7265539204919833, + "grad_norm": 2.46480393409729, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7248658537864685, + "num_tokens": 397946613.0, + "step": 15722 + }, + { + "epoch": 1.7266637381945968, + "grad_norm": 2.2052783966064453, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7120208740234375, + "num_tokens": 397975436.0, + "step": 15723 + }, + { + "epoch": 1.7267735558972106, + "grad_norm": 2.380885362625122, + "learning_rate": 1e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.741176962852478, + "num_tokens": 397997741.0, + "step": 15724 + }, + { + "epoch": 1.7268833735998244, + "grad_norm": 2.3881428241729736, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7126005291938782, + "num_tokens": 398022017.0, + "step": 15725 + }, + { + "epoch": 1.7269931913024381, + "grad_norm": 2.0869710445404053, + "learning_rate": 1e-06, + "loss": 0.8546, + "mean_token_accuracy": 0.7308162450790405, + "num_tokens": 398049180.0, + "step": 15726 + }, + { + "epoch": 1.7271030090050516, + "grad_norm": 2.404926300048828, + "learning_rate": 1e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7355607748031616, + "num_tokens": 398071760.0, + "step": 15727 + }, + { + "epoch": 1.7272128267076652, + "grad_norm": 2.1415789127349854, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.719931960105896, + "num_tokens": 398100239.0, + "step": 15728 + }, + { + "epoch": 1.727322644410279, + "grad_norm": 2.307352066040039, + "learning_rate": 1e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7210356593132019, + "num_tokens": 398124392.0, + "step": 15729 + }, + { + "epoch": 1.7274324621128927, + "grad_norm": 2.3539562225341797, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7242025136947632, + "num_tokens": 398148611.0, + "step": 15730 + }, + { + "epoch": 
1.7275422798155062, + "grad_norm": 2.591426134109497, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7150556445121765, + "num_tokens": 398169016.0, + "step": 15731 + }, + { + "epoch": 1.7276520975181198, + "grad_norm": 2.218550205230713, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7020296454429626, + "num_tokens": 398197066.0, + "step": 15732 + }, + { + "epoch": 1.7277619152207335, + "grad_norm": 2.057858467102051, + "learning_rate": 1e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.6948556303977966, + "num_tokens": 398226761.0, + "step": 15733 + }, + { + "epoch": 1.7278717329233473, + "grad_norm": 2.056614398956299, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7267888188362122, + "num_tokens": 398255678.0, + "step": 15734 + }, + { + "epoch": 1.727981550625961, + "grad_norm": 2.281942129135132, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7224409580230713, + "num_tokens": 398279539.0, + "step": 15735 + }, + { + "epoch": 1.7280913683285746, + "grad_norm": 2.173895835876465, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7254419326782227, + "num_tokens": 398307314.0, + "step": 15736 + }, + { + "epoch": 1.728201186031188, + "grad_norm": 2.104107141494751, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.6870942115783691, + "num_tokens": 398338318.0, + "step": 15737 + }, + { + "epoch": 1.7283110037338019, + "grad_norm": 2.1316962242126465, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7005301117897034, + "num_tokens": 398368766.0, + "step": 15738 + }, + { + "epoch": 1.7284208214364156, + "grad_norm": 2.5062034130096436, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.6950275897979736, + "num_tokens": 398395448.0, + "step": 15739 + }, + { + "epoch": 1.7285306391390292, + "grad_norm": 2.2896368503570557, + "learning_rate": 1e-06, + "loss": 0.9103, + "mean_token_accuracy": 
0.7146921157836914, + "num_tokens": 398420287.0, + "step": 15740 + }, + { + "epoch": 1.728640456841643, + "grad_norm": 2.141613006591797, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7139067649841309, + "num_tokens": 398448555.0, + "step": 15741 + }, + { + "epoch": 1.7287502745442564, + "grad_norm": 2.400498390197754, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6962984800338745, + "num_tokens": 398472211.0, + "step": 15742 + }, + { + "epoch": 1.7288600922468702, + "grad_norm": 1.9069055318832397, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.6989446878433228, + "num_tokens": 398505274.0, + "step": 15743 + }, + { + "epoch": 1.728969909949484, + "grad_norm": 1.986444115638733, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.6947133541107178, + "num_tokens": 398539019.0, + "step": 15744 + }, + { + "epoch": 1.7290797276520975, + "grad_norm": 2.0889265537261963, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7151052355766296, + "num_tokens": 398569346.0, + "step": 15745 + }, + { + "epoch": 1.729189545354711, + "grad_norm": 2.2060673236846924, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.721254825592041, + "num_tokens": 398595150.0, + "step": 15746 + }, + { + "epoch": 1.7292993630573248, + "grad_norm": 2.219895839691162, + "learning_rate": 1e-06, + "loss": 0.8531, + "mean_token_accuracy": 0.7335011959075928, + "num_tokens": 398619677.0, + "step": 15747 + }, + { + "epoch": 1.7294091807599385, + "grad_norm": 2.079631805419922, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7034516334533691, + "num_tokens": 398647403.0, + "step": 15748 + }, + { + "epoch": 1.7295189984625523, + "grad_norm": 1.9543800354003906, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7056182026863098, + "num_tokens": 398679085.0, + "step": 15749 + }, + { + "epoch": 1.7296288161651658, + "grad_norm": 
2.1435623168945312, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7183448672294617, + "num_tokens": 398706611.0, + "step": 15750 + }, + { + "epoch": 1.7297386338677794, + "grad_norm": 2.1003260612487793, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7115583419799805, + "num_tokens": 398735590.0, + "step": 15751 + }, + { + "epoch": 1.7298484515703931, + "grad_norm": 2.0046350955963135, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.6928718686103821, + "num_tokens": 398768441.0, + "step": 15752 + }, + { + "epoch": 1.7299582692730069, + "grad_norm": 2.1099114418029785, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.6990410685539246, + "num_tokens": 398797674.0, + "step": 15753 + }, + { + "epoch": 1.7300680869756204, + "grad_norm": 2.108389377593994, + "learning_rate": 1e-06, + "loss": 0.8444, + "mean_token_accuracy": 0.7326527833938599, + "num_tokens": 398826259.0, + "step": 15754 + }, + { + "epoch": 1.7301779046782342, + "grad_norm": 1.894830584526062, + "learning_rate": 1e-06, + "loss": 1.0153, + "mean_token_accuracy": 0.6912481784820557, + "num_tokens": 398859506.0, + "step": 15755 + }, + { + "epoch": 1.7302877223808477, + "grad_norm": 2.1271088123321533, + "learning_rate": 1e-06, + "loss": 0.8557, + "mean_token_accuracy": 0.7291771173477173, + "num_tokens": 398885930.0, + "step": 15756 + }, + { + "epoch": 1.7303975400834615, + "grad_norm": 2.4535841941833496, + "learning_rate": 1e-06, + "loss": 0.8576, + "mean_token_accuracy": 0.7365133166313171, + "num_tokens": 398907800.0, + "step": 15757 + }, + { + "epoch": 1.7305073577860752, + "grad_norm": 2.3771214485168457, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7291416525840759, + "num_tokens": 398932805.0, + "step": 15758 + }, + { + "epoch": 1.7306171754886888, + "grad_norm": 2.072324752807617, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.6908934116363525, + 
"num_tokens": 398963716.0, + "step": 15759 + }, + { + "epoch": 1.7307269931913023, + "grad_norm": 2.3338868618011475, + "learning_rate": 1e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7282239198684692, + "num_tokens": 398987086.0, + "step": 15760 + }, + { + "epoch": 1.730836810893916, + "grad_norm": 2.2383484840393066, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7092524766921997, + "num_tokens": 399015962.0, + "step": 15761 + }, + { + "epoch": 1.7309466285965298, + "grad_norm": 2.2243216037750244, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7237707376480103, + "num_tokens": 399044004.0, + "step": 15762 + }, + { + "epoch": 1.7310564462991436, + "grad_norm": 2.7420387268066406, + "learning_rate": 1e-06, + "loss": 0.8228, + "mean_token_accuracy": 0.7506004571914673, + "num_tokens": 399061601.0, + "step": 15763 + }, + { + "epoch": 1.731166264001757, + "grad_norm": 2.1798887252807617, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7114093899726868, + "num_tokens": 399091269.0, + "step": 15764 + }, + { + "epoch": 1.7312760817043706, + "grad_norm": 2.070828914642334, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7000431418418884, + "num_tokens": 399121771.0, + "step": 15765 + }, + { + "epoch": 1.7313858994069844, + "grad_norm": 2.369889497756958, + "learning_rate": 1e-06, + "loss": 0.837, + "mean_token_accuracy": 0.7419586777687073, + "num_tokens": 399145586.0, + "step": 15766 + }, + { + "epoch": 1.7314957171095982, + "grad_norm": 2.0846259593963623, + "learning_rate": 1e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7374308109283447, + "num_tokens": 399171713.0, + "step": 15767 + }, + { + "epoch": 1.7316055348122117, + "grad_norm": 2.07236385345459, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7162641286849976, + "num_tokens": 399205368.0, + "step": 15768 + }, + { + "epoch": 1.7317153525148252, + "grad_norm": 1.96523118019104, + 
"learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7031823396682739, + "num_tokens": 399239522.0, + "step": 15769 + }, + { + "epoch": 1.731825170217439, + "grad_norm": 2.4613194465637207, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7084891200065613, + "num_tokens": 399262554.0, + "step": 15770 + }, + { + "epoch": 1.7319349879200527, + "grad_norm": 2.308228015899658, + "learning_rate": 1e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7331221103668213, + "num_tokens": 399286447.0, + "step": 15771 + }, + { + "epoch": 1.7320448056226665, + "grad_norm": 2.2841343879699707, + "learning_rate": 1e-06, + "loss": 0.7808, + "mean_token_accuracy": 0.7524490356445312, + "num_tokens": 399310914.0, + "step": 15772 + }, + { + "epoch": 1.73215462332528, + "grad_norm": 2.0442163944244385, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7114753127098083, + "num_tokens": 399340112.0, + "step": 15773 + }, + { + "epoch": 1.7322644410278936, + "grad_norm": 2.022982358932495, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7209113240242004, + "num_tokens": 399372150.0, + "step": 15774 + }, + { + "epoch": 1.7323742587305073, + "grad_norm": 2.3396430015563965, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7041182518005371, + "num_tokens": 399396617.0, + "step": 15775 + }, + { + "epoch": 1.732484076433121, + "grad_norm": 2.2139625549316406, + "learning_rate": 1e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.7428550720214844, + "num_tokens": 399421921.0, + "step": 15776 + }, + { + "epoch": 1.7325938941357348, + "grad_norm": 2.006624698638916, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7311164140701294, + "num_tokens": 399451044.0, + "step": 15777 + }, + { + "epoch": 1.7327037118383484, + "grad_norm": 2.401308298110962, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7044868469238281, + "num_tokens": 399474929.0, + "step": 
15778 + }, + { + "epoch": 1.732813529540962, + "grad_norm": 2.5952398777008057, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7249513864517212, + "num_tokens": 399495300.0, + "step": 15779 + }, + { + "epoch": 1.7329233472435757, + "grad_norm": 2.4420409202575684, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7134088277816772, + "num_tokens": 399518133.0, + "step": 15780 + }, + { + "epoch": 1.7330331649461894, + "grad_norm": 2.1276938915252686, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7200405597686768, + "num_tokens": 399544778.0, + "step": 15781 + }, + { + "epoch": 1.733142982648803, + "grad_norm": 2.20290470123291, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7239214181900024, + "num_tokens": 399570261.0, + "step": 15782 + }, + { + "epoch": 1.7332528003514165, + "grad_norm": 2.386732339859009, + "learning_rate": 1e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7371790409088135, + "num_tokens": 399591996.0, + "step": 15783 + }, + { + "epoch": 1.7333626180540302, + "grad_norm": 2.57891845703125, + "learning_rate": 1e-06, + "loss": 0.7985, + "mean_token_accuracy": 0.7436462044715881, + "num_tokens": 399610877.0, + "step": 15784 + }, + { + "epoch": 1.733472435756644, + "grad_norm": 2.139946937561035, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7355246543884277, + "num_tokens": 399637481.0, + "step": 15785 + }, + { + "epoch": 1.7335822534592578, + "grad_norm": 2.237420082092285, + "learning_rate": 1e-06, + "loss": 0.7473, + "mean_token_accuracy": 0.7576172351837158, + "num_tokens": 399660581.0, + "step": 15786 + }, + { + "epoch": 1.7336920711618713, + "grad_norm": 2.3060503005981445, + "learning_rate": 1e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7121269702911377, + "num_tokens": 399684408.0, + "step": 15787 + }, + { + "epoch": 1.7338018888644848, + "grad_norm": 2.0175364017486572, + "learning_rate": 1e-06, + "loss": 0.9902, + 
"mean_token_accuracy": 0.6968882083892822, + "num_tokens": 399716019.0, + "step": 15788 + }, + { + "epoch": 1.7339117065670986, + "grad_norm": 2.4024219512939453, + "learning_rate": 1e-06, + "loss": 0.8604, + "mean_token_accuracy": 0.731684684753418, + "num_tokens": 399737846.0, + "step": 15789 + }, + { + "epoch": 1.7340215242697123, + "grad_norm": 2.121110200881958, + "learning_rate": 1e-06, + "loss": 0.937, + "mean_token_accuracy": 0.7041236758232117, + "num_tokens": 399767999.0, + "step": 15790 + }, + { + "epoch": 1.734131341972326, + "grad_norm": 2.3776350021362305, + "learning_rate": 1e-06, + "loss": 0.876, + "mean_token_accuracy": 0.715997040271759, + "num_tokens": 399790313.0, + "step": 15791 + }, + { + "epoch": 1.7342411596749396, + "grad_norm": 2.262080192565918, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7131951451301575, + "num_tokens": 399814949.0, + "step": 15792 + }, + { + "epoch": 1.7343509773775532, + "grad_norm": 2.1496574878692627, + "learning_rate": 1e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7252554297447205, + "num_tokens": 399842542.0, + "step": 15793 + }, + { + "epoch": 1.734460795080167, + "grad_norm": 2.4572339057922363, + "learning_rate": 1e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.7223840355873108, + "num_tokens": 399865765.0, + "step": 15794 + }, + { + "epoch": 1.7345706127827807, + "grad_norm": 1.9340440034866333, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.7007993459701538, + "num_tokens": 399900790.0, + "step": 15795 + }, + { + "epoch": 1.7346804304853942, + "grad_norm": 2.283447027206421, + "learning_rate": 1e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.7213354706764221, + "num_tokens": 399926699.0, + "step": 15796 + }, + { + "epoch": 1.7347902481880078, + "grad_norm": 2.5340797901153564, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7160401344299316, + "num_tokens": 399947492.0, + "step": 15797 + }, + { + "epoch": 1.7349000658906215, 
+ "grad_norm": 2.1489102840423584, + "learning_rate": 1e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7324978113174438, + "num_tokens": 399974460.0, + "step": 15798 + }, + { + "epoch": 1.7350098835932353, + "grad_norm": 2.7611920833587646, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7144271731376648, + "num_tokens": 399993427.0, + "step": 15799 + }, + { + "epoch": 1.735119701295849, + "grad_norm": 2.455514907836914, + "learning_rate": 1e-06, + "loss": 0.7936, + "mean_token_accuracy": 0.7442090511322021, + "num_tokens": 400013727.0, + "step": 15800 + }, + { + "epoch": 1.7352295189984626, + "grad_norm": 2.007378101348877, + "learning_rate": 1e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.7309108376502991, + "num_tokens": 400042972.0, + "step": 15801 + }, + { + "epoch": 1.735339336701076, + "grad_norm": 2.105564832687378, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.7253202795982361, + "num_tokens": 400071470.0, + "step": 15802 + }, + { + "epoch": 1.7354491544036899, + "grad_norm": 2.1720328330993652, + "learning_rate": 1e-06, + "loss": 0.832, + "mean_token_accuracy": 0.7352460622787476, + "num_tokens": 400098631.0, + "step": 15803 + }, + { + "epoch": 1.7355589721063036, + "grad_norm": 2.156438112258911, + "learning_rate": 1e-06, + "loss": 0.8606, + "mean_token_accuracy": 0.7261964678764343, + "num_tokens": 400125762.0, + "step": 15804 + }, + { + "epoch": 1.7356687898089171, + "grad_norm": 2.1770076751708984, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7041690349578857, + "num_tokens": 400153326.0, + "step": 15805 + }, + { + "epoch": 1.735778607511531, + "grad_norm": 2.4276537895202637, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7139912247657776, + "num_tokens": 400176770.0, + "step": 15806 + }, + { + "epoch": 1.7358884252141444, + "grad_norm": 2.8249671459198, + "learning_rate": 1e-06, + "loss": 0.7941, + "mean_token_accuracy": 0.7412456274032593, + 
"num_tokens": 400193384.0, + "step": 15807 + }, + { + "epoch": 1.7359982429167582, + "grad_norm": 2.425022840499878, + "learning_rate": 1e-06, + "loss": 0.8671, + "mean_token_accuracy": 0.7350642681121826, + "num_tokens": 400216303.0, + "step": 15808 + }, + { + "epoch": 1.736108060619372, + "grad_norm": 2.2191684246063232, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7184796333312988, + "num_tokens": 400242475.0, + "step": 15809 + }, + { + "epoch": 1.7362178783219855, + "grad_norm": 2.0489144325256348, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7222102284431458, + "num_tokens": 400270557.0, + "step": 15810 + }, + { + "epoch": 1.736327696024599, + "grad_norm": 2.062361717224121, + "learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7229565978050232, + "num_tokens": 400302067.0, + "step": 15811 + }, + { + "epoch": 1.7364375137272128, + "grad_norm": 2.0879197120666504, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7194784283638, + "num_tokens": 400330744.0, + "step": 15812 + }, + { + "epoch": 1.7365473314298265, + "grad_norm": 2.099226236343384, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7129835486412048, + "num_tokens": 400359799.0, + "step": 15813 + }, + { + "epoch": 1.7366571491324403, + "grad_norm": 2.0580272674560547, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.6975951194763184, + "num_tokens": 400389720.0, + "step": 15814 + }, + { + "epoch": 1.7367669668350538, + "grad_norm": 2.064957618713379, + "learning_rate": 1e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.7265474796295166, + "num_tokens": 400418309.0, + "step": 15815 + }, + { + "epoch": 1.7368767845376674, + "grad_norm": 2.3904941082000732, + "learning_rate": 1e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7176898717880249, + "num_tokens": 400442351.0, + "step": 15816 + }, + { + "epoch": 1.7369866022402811, + "grad_norm": 2.282247304916382, + 
"learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7243340611457825, + "num_tokens": 400467063.0, + "step": 15817 + }, + { + "epoch": 1.7370964199428949, + "grad_norm": 2.371171236038208, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7069392800331116, + "num_tokens": 400489988.0, + "step": 15818 + }, + { + "epoch": 1.7372062376455084, + "grad_norm": 2.5622806549072266, + "learning_rate": 1e-06, + "loss": 0.8364, + "mean_token_accuracy": 0.7305439710617065, + "num_tokens": 400509674.0, + "step": 15819 + }, + { + "epoch": 1.7373160553481222, + "grad_norm": 2.053297519683838, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.6955553293228149, + "num_tokens": 400542643.0, + "step": 15820 + }, + { + "epoch": 1.7374258730507357, + "grad_norm": 2.4798429012298584, + "learning_rate": 1e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.71486896276474, + "num_tokens": 400563750.0, + "step": 15821 + }, + { + "epoch": 1.7375356907533495, + "grad_norm": 2.1878445148468018, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7246915102005005, + "num_tokens": 400593624.0, + "step": 15822 + }, + { + "epoch": 1.7376455084559632, + "grad_norm": 2.2506370544433594, + "learning_rate": 1e-06, + "loss": 0.8263, + "mean_token_accuracy": 0.7424871325492859, + "num_tokens": 400618900.0, + "step": 15823 + }, + { + "epoch": 1.7377553261585768, + "grad_norm": 2.264958143234253, + "learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.721407413482666, + "num_tokens": 400644690.0, + "step": 15824 + }, + { + "epoch": 1.7378651438611903, + "grad_norm": 2.4336133003234863, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.702418327331543, + "num_tokens": 400670140.0, + "step": 15825 + }, + { + "epoch": 1.737974961563804, + "grad_norm": 2.0849239826202393, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.705813467502594, + "num_tokens": 400700426.0, + "step": 15826 
+ }, + { + "epoch": 1.7380847792664178, + "grad_norm": 2.269289016723633, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7261939644813538, + "num_tokens": 400723885.0, + "step": 15827 + }, + { + "epoch": 1.7381945969690316, + "grad_norm": 2.5927419662475586, + "learning_rate": 1e-06, + "loss": 0.861, + "mean_token_accuracy": 0.7330842018127441, + "num_tokens": 400745800.0, + "step": 15828 + }, + { + "epoch": 1.738304414671645, + "grad_norm": 2.0567171573638916, + "learning_rate": 1e-06, + "loss": 0.8581, + "mean_token_accuracy": 0.7293981313705444, + "num_tokens": 400772998.0, + "step": 15829 + }, + { + "epoch": 1.7384142323742586, + "grad_norm": 1.9784646034240723, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7205651998519897, + "num_tokens": 400803016.0, + "step": 15830 + }, + { + "epoch": 1.7385240500768724, + "grad_norm": 2.2648122310638428, + "learning_rate": 1e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.72528475522995, + "num_tokens": 400829391.0, + "step": 15831 + }, + { + "epoch": 1.7386338677794861, + "grad_norm": 2.262234926223755, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7156096696853638, + "num_tokens": 400855514.0, + "step": 15832 + }, + { + "epoch": 1.7387436854820997, + "grad_norm": 2.424309492111206, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7187986373901367, + "num_tokens": 400878565.0, + "step": 15833 + }, + { + "epoch": 1.7388535031847132, + "grad_norm": 2.607271194458008, + "learning_rate": 1e-06, + "loss": 0.8398, + "mean_token_accuracy": 0.7357428669929504, + "num_tokens": 400899882.0, + "step": 15834 + }, + { + "epoch": 1.738963320887327, + "grad_norm": 2.295330762863159, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7133572697639465, + "num_tokens": 400925441.0, + "step": 15835 + }, + { + "epoch": 1.7390731385899407, + "grad_norm": 2.1584267616271973, + "learning_rate": 1e-06, + "loss": 0.9382, + 
"mean_token_accuracy": 0.7103540301322937, + "num_tokens": 400954181.0, + "step": 15836 + }, + { + "epoch": 1.7391829562925545, + "grad_norm": 2.1813852787017822, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7096966505050659, + "num_tokens": 400982215.0, + "step": 15837 + }, + { + "epoch": 1.739292773995168, + "grad_norm": 2.3375661373138428, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7252078056335449, + "num_tokens": 401007340.0, + "step": 15838 + }, + { + "epoch": 1.7394025916977816, + "grad_norm": 2.422416925430298, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7225610017776489, + "num_tokens": 401030024.0, + "step": 15839 + }, + { + "epoch": 1.7395124094003953, + "grad_norm": 2.1392159461975098, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.692865252494812, + "num_tokens": 401057845.0, + "step": 15840 + }, + { + "epoch": 1.739622227103009, + "grad_norm": 2.710688829421997, + "learning_rate": 1e-06, + "loss": 0.7981, + "mean_token_accuracy": 0.744584321975708, + "num_tokens": 401077034.0, + "step": 15841 + }, + { + "epoch": 1.7397320448056228, + "grad_norm": 2.1921935081481934, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7260043025016785, + "num_tokens": 401102797.0, + "step": 15842 + }, + { + "epoch": 1.7398418625082364, + "grad_norm": 2.13246488571167, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7135251760482788, + "num_tokens": 401131385.0, + "step": 15843 + }, + { + "epoch": 1.73995168021085, + "grad_norm": 2.3245511054992676, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7161105871200562, + "num_tokens": 401158534.0, + "step": 15844 + }, + { + "epoch": 1.7400614979134637, + "grad_norm": 1.9076693058013916, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7119855880737305, + "num_tokens": 401190365.0, + "step": 15845 + }, + { + "epoch": 1.7401713156160774, + 
"grad_norm": 2.355379581451416, + "learning_rate": 1e-06, + "loss": 0.8398, + "mean_token_accuracy": 0.744098424911499, + "num_tokens": 401212791.0, + "step": 15846 + }, + { + "epoch": 1.740281133318691, + "grad_norm": 2.262387752532959, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.6959136128425598, + "num_tokens": 401238025.0, + "step": 15847 + }, + { + "epoch": 1.7403909510213045, + "grad_norm": 2.3002331256866455, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7202340960502625, + "num_tokens": 401263411.0, + "step": 15848 + }, + { + "epoch": 1.7405007687239182, + "grad_norm": 2.008784055709839, + "learning_rate": 1e-06, + "loss": 0.748, + "mean_token_accuracy": 0.7622550129890442, + "num_tokens": 401291095.0, + "step": 15849 + }, + { + "epoch": 1.740610586426532, + "grad_norm": 2.178762912750244, + "learning_rate": 1e-06, + "loss": 0.857, + "mean_token_accuracy": 0.7259088754653931, + "num_tokens": 401315779.0, + "step": 15850 + }, + { + "epoch": 1.7407204041291457, + "grad_norm": 2.062725305557251, + "learning_rate": 1e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.7186920642852783, + "num_tokens": 401346160.0, + "step": 15851 + }, + { + "epoch": 1.7408302218317593, + "grad_norm": 2.115022659301758, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7119960784912109, + "num_tokens": 401372283.0, + "step": 15852 + }, + { + "epoch": 1.7409400395343728, + "grad_norm": 2.668306350708008, + "learning_rate": 1e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7206952571868896, + "num_tokens": 401393023.0, + "step": 15853 + }, + { + "epoch": 1.7410498572369866, + "grad_norm": 2.0660359859466553, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7008308172225952, + "num_tokens": 401421561.0, + "step": 15854 + }, + { + "epoch": 1.7411596749396003, + "grad_norm": 2.364607572555542, + "learning_rate": 1e-06, + "loss": 0.8543, + "mean_token_accuracy": 0.7457529902458191, + 
"num_tokens": 401443959.0, + "step": 15855 + }, + { + "epoch": 1.741269492642214, + "grad_norm": 2.438758611679077, + "learning_rate": 1e-06, + "loss": 0.8466, + "mean_token_accuracy": 0.7384427785873413, + "num_tokens": 401466010.0, + "step": 15856 + }, + { + "epoch": 1.7413793103448276, + "grad_norm": 2.200331926345825, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7152114510536194, + "num_tokens": 401492368.0, + "step": 15857 + }, + { + "epoch": 1.7414891280474412, + "grad_norm": 2.1531383991241455, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7031151056289673, + "num_tokens": 401521646.0, + "step": 15858 + }, + { + "epoch": 1.741598945750055, + "grad_norm": 2.482286214828491, + "learning_rate": 1e-06, + "loss": 0.8725, + "mean_token_accuracy": 0.7265900373458862, + "num_tokens": 401543475.0, + "step": 15859 + }, + { + "epoch": 1.7417087634526687, + "grad_norm": 2.1542317867279053, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7350162267684937, + "num_tokens": 401569704.0, + "step": 15860 + }, + { + "epoch": 1.7418185811552822, + "grad_norm": 2.3646130561828613, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7211177349090576, + "num_tokens": 401592672.0, + "step": 15861 + }, + { + "epoch": 1.7419283988578957, + "grad_norm": 2.973836660385132, + "learning_rate": 1e-06, + "loss": 0.8265, + "mean_token_accuracy": 0.7372883558273315, + "num_tokens": 401608934.0, + "step": 15862 + }, + { + "epoch": 1.7420382165605095, + "grad_norm": 2.135179042816162, + "learning_rate": 1e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7215601205825806, + "num_tokens": 401635989.0, + "step": 15863 + }, + { + "epoch": 1.7421480342631233, + "grad_norm": 1.9909591674804688, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7159324884414673, + "num_tokens": 401667939.0, + "step": 15864 + }, + { + "epoch": 1.742257851965737, + "grad_norm": 2.3521785736083984, + 
"learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7171595096588135, + "num_tokens": 401691237.0, + "step": 15865 + }, + { + "epoch": 1.7423676696683505, + "grad_norm": 2.2358267307281494, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7164366245269775, + "num_tokens": 401716574.0, + "step": 15866 + }, + { + "epoch": 1.742477487370964, + "grad_norm": 1.9430348873138428, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7056412696838379, + "num_tokens": 401750674.0, + "step": 15867 + }, + { + "epoch": 1.7425873050735778, + "grad_norm": 2.214602470397949, + "learning_rate": 1e-06, + "loss": 0.8709, + "mean_token_accuracy": 0.7270692586898804, + "num_tokens": 401774832.0, + "step": 15868 + }, + { + "epoch": 1.7426971227761916, + "grad_norm": 2.1420881748199463, + "learning_rate": 1e-06, + "loss": 1.0392, + "mean_token_accuracy": 0.6769241094589233, + "num_tokens": 401805510.0, + "step": 15869 + }, + { + "epoch": 1.7428069404788051, + "grad_norm": 2.545792818069458, + "learning_rate": 1e-06, + "loss": 0.8452, + "mean_token_accuracy": 0.7329782247543335, + "num_tokens": 401827141.0, + "step": 15870 + }, + { + "epoch": 1.742916758181419, + "grad_norm": 2.0611135959625244, + "learning_rate": 1e-06, + "loss": 0.832, + "mean_token_accuracy": 0.7407814860343933, + "num_tokens": 401854762.0, + "step": 15871 + }, + { + "epoch": 1.7430265758840324, + "grad_norm": 2.2399752140045166, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7223575711250305, + "num_tokens": 401879837.0, + "step": 15872 + }, + { + "epoch": 1.7431363935866462, + "grad_norm": 2.518773078918457, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7099791765213013, + "num_tokens": 401901969.0, + "step": 15873 + }, + { + "epoch": 1.74324621128926, + "grad_norm": 2.4043328762054443, + "learning_rate": 1e-06, + "loss": 0.8502, + "mean_token_accuracy": 0.7343434691429138, + "num_tokens": 401924181.0, + "step": 
15874 + }, + { + "epoch": 1.7433560289918735, + "grad_norm": 2.1650326251983643, + "learning_rate": 1e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.7447269558906555, + "num_tokens": 401948809.0, + "step": 15875 + }, + { + "epoch": 1.743465846694487, + "grad_norm": 2.395646095275879, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.728400468826294, + "num_tokens": 401970118.0, + "step": 15876 + }, + { + "epoch": 1.7435756643971008, + "grad_norm": 2.181464672088623, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7052053213119507, + "num_tokens": 401998665.0, + "step": 15877 + }, + { + "epoch": 1.7436854820997145, + "grad_norm": 2.421161651611328, + "learning_rate": 1e-06, + "loss": 0.8656, + "mean_token_accuracy": 0.7217246890068054, + "num_tokens": 402020826.0, + "step": 15878 + }, + { + "epoch": 1.7437952998023283, + "grad_norm": 2.3801534175872803, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7178666591644287, + "num_tokens": 402044690.0, + "step": 15879 + }, + { + "epoch": 1.7439051175049418, + "grad_norm": 2.2192914485931396, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7136932611465454, + "num_tokens": 402071184.0, + "step": 15880 + }, + { + "epoch": 1.7440149352075554, + "grad_norm": 2.0843772888183594, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7123099565505981, + "num_tokens": 402101267.0, + "step": 15881 + }, + { + "epoch": 1.744124752910169, + "grad_norm": 2.6029856204986572, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7189176082611084, + "num_tokens": 402122039.0, + "step": 15882 + }, + { + "epoch": 1.7442345706127829, + "grad_norm": 2.4973719120025635, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7195569276809692, + "num_tokens": 402144055.0, + "step": 15883 + }, + { + "epoch": 1.7443443883153964, + "grad_norm": 2.1339261531829834, + "learning_rate": 1e-06, + "loss": 0.915, + 
"mean_token_accuracy": 0.7087419033050537, + "num_tokens": 402171596.0, + "step": 15884 + }, + { + "epoch": 1.7444542060180102, + "grad_norm": 2.633192300796509, + "learning_rate": 1e-06, + "loss": 0.7724, + "mean_token_accuracy": 0.7429817914962769, + "num_tokens": 402189767.0, + "step": 15885 + }, + { + "epoch": 1.7445640237206237, + "grad_norm": 2.706918716430664, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7341955900192261, + "num_tokens": 402208543.0, + "step": 15886 + }, + { + "epoch": 1.7446738414232374, + "grad_norm": 2.3126134872436523, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7117170095443726, + "num_tokens": 402233709.0, + "step": 15887 + }, + { + "epoch": 1.7447836591258512, + "grad_norm": 2.1743593215942383, + "learning_rate": 1e-06, + "loss": 0.8493, + "mean_token_accuracy": 0.7343333959579468, + "num_tokens": 402261647.0, + "step": 15888 + }, + { + "epoch": 1.7448934768284647, + "grad_norm": 2.648427963256836, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7336752414703369, + "num_tokens": 402282082.0, + "step": 15889 + }, + { + "epoch": 1.7450032945310783, + "grad_norm": 2.231152057647705, + "learning_rate": 1e-06, + "loss": 0.8086, + "mean_token_accuracy": 0.7458660006523132, + "num_tokens": 402306310.0, + "step": 15890 + }, + { + "epoch": 1.745113112233692, + "grad_norm": 2.2885117530822754, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7184082269668579, + "num_tokens": 402332984.0, + "step": 15891 + }, + { + "epoch": 1.7452229299363058, + "grad_norm": 2.3923418521881104, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7139307260513306, + "num_tokens": 402357526.0, + "step": 15892 + }, + { + "epoch": 1.7453327476389195, + "grad_norm": 2.282292366027832, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.6982519626617432, + "num_tokens": 402387991.0, + "step": 15893 + }, + { + "epoch": 
1.745442565341533, + "grad_norm": 2.7081916332244873, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7102005481719971, + "num_tokens": 402413592.0, + "step": 15894 + }, + { + "epoch": 1.7455523830441466, + "grad_norm": 2.2399849891662598, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7229323387145996, + "num_tokens": 402440079.0, + "step": 15895 + }, + { + "epoch": 1.7456622007467604, + "grad_norm": 2.2169270515441895, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7057821154594421, + "num_tokens": 402468582.0, + "step": 15896 + }, + { + "epoch": 1.7457720184493741, + "grad_norm": 2.1319873332977295, + "learning_rate": 1e-06, + "loss": 0.7848, + "mean_token_accuracy": 0.7467070817947388, + "num_tokens": 402494117.0, + "step": 15897 + }, + { + "epoch": 1.7458818361519877, + "grad_norm": 2.2874255180358887, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7019475102424622, + "num_tokens": 402517682.0, + "step": 15898 + }, + { + "epoch": 1.7459916538546012, + "grad_norm": 2.1205825805664062, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7132702469825745, + "num_tokens": 402547730.0, + "step": 15899 + }, + { + "epoch": 1.746101471557215, + "grad_norm": 2.073150396347046, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7024822235107422, + "num_tokens": 402580749.0, + "step": 15900 + }, + { + "epoch": 1.7462112892598287, + "grad_norm": 2.3101348876953125, + "learning_rate": 1e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7323476672172546, + "num_tokens": 402603492.0, + "step": 15901 + }, + { + "epoch": 1.7463211069624425, + "grad_norm": 2.1769206523895264, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7107877731323242, + "num_tokens": 402630239.0, + "step": 15902 + }, + { + "epoch": 1.746430924665056, + "grad_norm": 2.081998825073242, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 
0.7092121243476868, + "num_tokens": 402659819.0, + "step": 15903 + }, + { + "epoch": 1.7465407423676695, + "grad_norm": 2.220226764678955, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7228938341140747, + "num_tokens": 402685927.0, + "step": 15904 + }, + { + "epoch": 1.7466505600702833, + "grad_norm": 2.6638574600219727, + "learning_rate": 1e-06, + "loss": 0.7469, + "mean_token_accuracy": 0.7596659064292908, + "num_tokens": 402703426.0, + "step": 15905 + }, + { + "epoch": 1.746760377772897, + "grad_norm": 2.223731279373169, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7141680717468262, + "num_tokens": 402731636.0, + "step": 15906 + }, + { + "epoch": 1.7468701954755108, + "grad_norm": 2.406480550765991, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.712466835975647, + "num_tokens": 402756525.0, + "step": 15907 + }, + { + "epoch": 1.7469800131781243, + "grad_norm": 2.330770969390869, + "learning_rate": 1e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7276548147201538, + "num_tokens": 402779501.0, + "step": 15908 + }, + { + "epoch": 1.7470898308807379, + "grad_norm": 2.660343647003174, + "learning_rate": 1e-06, + "loss": 0.849, + "mean_token_accuracy": 0.7365486025810242, + "num_tokens": 402798169.0, + "step": 15909 + }, + { + "epoch": 1.7471996485833516, + "grad_norm": 2.6330997943878174, + "learning_rate": 1e-06, + "loss": 0.8224, + "mean_token_accuracy": 0.7455710768699646, + "num_tokens": 402816391.0, + "step": 15910 + }, + { + "epoch": 1.7473094662859654, + "grad_norm": 1.9683337211608887, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.71022629737854, + "num_tokens": 402848893.0, + "step": 15911 + }, + { + "epoch": 1.747419283988579, + "grad_norm": 2.1153171062469482, + "learning_rate": 1e-06, + "loss": 0.8577, + "mean_token_accuracy": 0.7275679707527161, + "num_tokens": 402874763.0, + "step": 15912 + }, + { + "epoch": 1.7475291016911925, + "grad_norm": 
2.226231336593628, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7001581192016602, + "num_tokens": 402900860.0, + "step": 15913 + }, + { + "epoch": 1.7476389193938062, + "grad_norm": 2.261974573135376, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7000494599342346, + "num_tokens": 402926556.0, + "step": 15914 + }, + { + "epoch": 1.74774873709642, + "grad_norm": 2.0190606117248535, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7007752656936646, + "num_tokens": 402959877.0, + "step": 15915 + }, + { + "epoch": 1.7478585547990337, + "grad_norm": 2.0389962196350098, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7032889127731323, + "num_tokens": 402988786.0, + "step": 15916 + }, + { + "epoch": 1.7479683725016473, + "grad_norm": 2.224094867706299, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7263092994689941, + "num_tokens": 403013388.0, + "step": 15917 + }, + { + "epoch": 1.7480781902042608, + "grad_norm": 2.3361175060272217, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7226740717887878, + "num_tokens": 403039411.0, + "step": 15918 + }, + { + "epoch": 1.7481880079068746, + "grad_norm": 2.0265026092529297, + "learning_rate": 1e-06, + "loss": 0.857, + "mean_token_accuracy": 0.727180004119873, + "num_tokens": 403069516.0, + "step": 15919 + }, + { + "epoch": 1.7482978256094883, + "grad_norm": 2.0352323055267334, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7169109582901001, + "num_tokens": 403097571.0, + "step": 15920 + }, + { + "epoch": 1.7484076433121019, + "grad_norm": 2.3955113887786865, + "learning_rate": 1e-06, + "loss": 0.84, + "mean_token_accuracy": 0.7360792756080627, + "num_tokens": 403120611.0, + "step": 15921 + }, + { + "epoch": 1.7485174610147156, + "grad_norm": 2.160881280899048, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7273333072662354, + "num_tokens": 
403144910.0, + "step": 15922 + }, + { + "epoch": 1.7486272787173291, + "grad_norm": 2.548464059829712, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7122464179992676, + "num_tokens": 403166002.0, + "step": 15923 + }, + { + "epoch": 1.748737096419943, + "grad_norm": 2.581730842590332, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7036689519882202, + "num_tokens": 403186204.0, + "step": 15924 + }, + { + "epoch": 1.7488469141225567, + "grad_norm": 2.2358925342559814, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.6972103118896484, + "num_tokens": 403214452.0, + "step": 15925 + }, + { + "epoch": 1.7489567318251702, + "grad_norm": 2.082265853881836, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7149766683578491, + "num_tokens": 403242726.0, + "step": 15926 + }, + { + "epoch": 1.7490665495277837, + "grad_norm": 2.5404720306396484, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.724122166633606, + "num_tokens": 403264846.0, + "step": 15927 + }, + { + "epoch": 1.7491763672303975, + "grad_norm": 2.2917590141296387, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7137084007263184, + "num_tokens": 403289041.0, + "step": 15928 + }, + { + "epoch": 1.7492861849330112, + "grad_norm": 2.300481081008911, + "learning_rate": 1e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.726107120513916, + "num_tokens": 403314673.0, + "step": 15929 + }, + { + "epoch": 1.749396002635625, + "grad_norm": 2.328810214996338, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.711193859577179, + "num_tokens": 403339391.0, + "step": 15930 + }, + { + "epoch": 1.7495058203382385, + "grad_norm": 2.2761666774749756, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7178536653518677, + "num_tokens": 403364693.0, + "step": 15931 + }, + { + "epoch": 1.749615638040852, + "grad_norm": 2.1770994663238525, + "learning_rate": 1e-06, + 
"loss": 0.8921, + "mean_token_accuracy": 0.7208770513534546, + "num_tokens": 403392105.0, + "step": 15932 + }, + { + "epoch": 1.7497254557434658, + "grad_norm": 2.1802308559417725, + "learning_rate": 1e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.7007889151573181, + "num_tokens": 403420175.0, + "step": 15933 + }, + { + "epoch": 1.7498352734460796, + "grad_norm": 2.0371642112731934, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.703254759311676, + "num_tokens": 403451737.0, + "step": 15934 + }, + { + "epoch": 1.7499450911486931, + "grad_norm": 2.1043856143951416, + "learning_rate": 1e-06, + "loss": 0.7835, + "mean_token_accuracy": 0.7541301250457764, + "num_tokens": 403478189.0, + "step": 15935 + }, + { + "epoch": 1.7500549088513069, + "grad_norm": 2.1289222240448, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7139589190483093, + "num_tokens": 403506736.0, + "step": 15936 + }, + { + "epoch": 1.7501647265539204, + "grad_norm": 2.501979351043701, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7255434393882751, + "num_tokens": 403528132.0, + "step": 15937 + }, + { + "epoch": 1.7502745442565342, + "grad_norm": 2.0161900520324707, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.702675461769104, + "num_tokens": 403558326.0, + "step": 15938 + }, + { + "epoch": 1.750384361959148, + "grad_norm": 2.026747465133667, + "learning_rate": 1e-06, + "loss": 0.7931, + "mean_token_accuracy": 0.746970534324646, + "num_tokens": 403585696.0, + "step": 15939 + }, + { + "epoch": 1.7504941796617615, + "grad_norm": 2.2718608379364014, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7284393310546875, + "num_tokens": 403610701.0, + "step": 15940 + }, + { + "epoch": 1.750603997364375, + "grad_norm": 2.212200164794922, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7255966067314148, + "num_tokens": 403636527.0, + "step": 15941 + }, + { + "epoch": 
1.7507138150669888, + "grad_norm": 2.4929072856903076, + "learning_rate": 1e-06, + "loss": 0.7128, + "mean_token_accuracy": 0.7596614360809326, + "num_tokens": 403655728.0, + "step": 15942 + }, + { + "epoch": 1.7508236327696025, + "grad_norm": 2.379706382751465, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.704373836517334, + "num_tokens": 403678722.0, + "step": 15943 + }, + { + "epoch": 1.7509334504722163, + "grad_norm": 2.11269474029541, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7135264873504639, + "num_tokens": 403706224.0, + "step": 15944 + }, + { + "epoch": 1.7510432681748298, + "grad_norm": 1.9896095991134644, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7073796987533569, + "num_tokens": 403736287.0, + "step": 15945 + }, + { + "epoch": 1.7511530858774433, + "grad_norm": 2.4476332664489746, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7316257953643799, + "num_tokens": 403758037.0, + "step": 15946 + }, + { + "epoch": 1.751262903580057, + "grad_norm": 2.5216641426086426, + "learning_rate": 1e-06, + "loss": 0.8399, + "mean_token_accuracy": 0.7443965673446655, + "num_tokens": 403778087.0, + "step": 15947 + }, + { + "epoch": 1.7513727212826709, + "grad_norm": 2.0298428535461426, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.699087917804718, + "num_tokens": 403806710.0, + "step": 15948 + }, + { + "epoch": 1.7514825389852844, + "grad_norm": 2.250622034072876, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7154579758644104, + "num_tokens": 403832742.0, + "step": 15949 + }, + { + "epoch": 1.7515923566878981, + "grad_norm": 2.3064777851104736, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7038060426712036, + "num_tokens": 403859795.0, + "step": 15950 + }, + { + "epoch": 1.7517021743905117, + "grad_norm": 2.245393991470337, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 
0.7115060091018677, + "num_tokens": 403885751.0, + "step": 15951 + }, + { + "epoch": 1.7518119920931254, + "grad_norm": 2.202625274658203, + "learning_rate": 1e-06, + "loss": 0.8437, + "mean_token_accuracy": 0.7391254901885986, + "num_tokens": 403910321.0, + "step": 15952 + }, + { + "epoch": 1.7519218097957392, + "grad_norm": 2.2265095710754395, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7000267505645752, + "num_tokens": 403937382.0, + "step": 15953 + }, + { + "epoch": 1.7520316274983527, + "grad_norm": 2.3990848064422607, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7263245582580566, + "num_tokens": 403959884.0, + "step": 15954 + }, + { + "epoch": 1.7521414452009663, + "grad_norm": 2.393493413925171, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7202787399291992, + "num_tokens": 403982418.0, + "step": 15955 + }, + { + "epoch": 1.75225126290358, + "grad_norm": 2.2971863746643066, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.727331817150116, + "num_tokens": 404007011.0, + "step": 15956 + }, + { + "epoch": 1.7523610806061938, + "grad_norm": 2.1127657890319824, + "learning_rate": 1e-06, + "loss": 0.8696, + "mean_token_accuracy": 0.7300698757171631, + "num_tokens": 404033920.0, + "step": 15957 + }, + { + "epoch": 1.7524708983088075, + "grad_norm": 2.1809005737304688, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7290360331535339, + "num_tokens": 404058885.0, + "step": 15958 + }, + { + "epoch": 1.752580716011421, + "grad_norm": 2.0347177982330322, + "learning_rate": 1e-06, + "loss": 1.0244, + "mean_token_accuracy": 0.687684178352356, + "num_tokens": 404091248.0, + "step": 15959 + }, + { + "epoch": 1.7526905337140346, + "grad_norm": 2.2189552783966064, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7247512340545654, + "num_tokens": 404117442.0, + "step": 15960 + }, + { + "epoch": 1.7528003514166484, + "grad_norm": 
2.417315721511841, + "learning_rate": 1e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7257347702980042, + "num_tokens": 404138484.0, + "step": 15961 + }, + { + "epoch": 1.7529101691192621, + "grad_norm": 2.3829259872436523, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.6985264420509338, + "num_tokens": 404162307.0, + "step": 15962 + }, + { + "epoch": 1.7530199868218757, + "grad_norm": 1.923378586769104, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.6856875419616699, + "num_tokens": 404194577.0, + "step": 15963 + }, + { + "epoch": 1.7531298045244892, + "grad_norm": 1.9481440782546997, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7005041241645813, + "num_tokens": 404228177.0, + "step": 15964 + }, + { + "epoch": 1.753239622227103, + "grad_norm": 2.5290122032165527, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7169733643531799, + "num_tokens": 404249181.0, + "step": 15965 + }, + { + "epoch": 1.7533494399297167, + "grad_norm": 2.2571985721588135, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7188825607299805, + "num_tokens": 404273911.0, + "step": 15966 + }, + { + "epoch": 1.7534592576323305, + "grad_norm": 2.0458242893218994, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7220571041107178, + "num_tokens": 404304447.0, + "step": 15967 + }, + { + "epoch": 1.753569075334944, + "grad_norm": 2.2588300704956055, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7198922634124756, + "num_tokens": 404330635.0, + "step": 15968 + }, + { + "epoch": 1.7536788930375575, + "grad_norm": 2.1060056686401367, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7043431997299194, + "num_tokens": 404359635.0, + "step": 15969 + }, + { + "epoch": 1.7537887107401713, + "grad_norm": 2.235374927520752, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7238336205482483, + 
"num_tokens": 404384714.0, + "step": 15970 + }, + { + "epoch": 1.753898528442785, + "grad_norm": 2.3323514461517334, + "learning_rate": 1e-06, + "loss": 0.8604, + "mean_token_accuracy": 0.726453423500061, + "num_tokens": 404408883.0, + "step": 15971 + }, + { + "epoch": 1.7540083461453988, + "grad_norm": 2.432490348815918, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.719740629196167, + "num_tokens": 404431447.0, + "step": 15972 + }, + { + "epoch": 1.7541181638480123, + "grad_norm": 2.489293098449707, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7166682481765747, + "num_tokens": 404456326.0, + "step": 15973 + }, + { + "epoch": 1.7542279815506259, + "grad_norm": 2.4927480220794678, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.71870356798172, + "num_tokens": 404478822.0, + "step": 15974 + }, + { + "epoch": 1.7543377992532396, + "grad_norm": 2.041048049926758, + "learning_rate": 1e-06, + "loss": 0.8335, + "mean_token_accuracy": 0.7375813722610474, + "num_tokens": 404507364.0, + "step": 15975 + }, + { + "epoch": 1.7544476169558534, + "grad_norm": 2.4387290477752686, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7060832977294922, + "num_tokens": 404531579.0, + "step": 15976 + }, + { + "epoch": 1.754557434658467, + "grad_norm": 2.2333476543426514, + "learning_rate": 1e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7230864763259888, + "num_tokens": 404556053.0, + "step": 15977 + }, + { + "epoch": 1.7546672523610805, + "grad_norm": 1.98802649974823, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7021141052246094, + "num_tokens": 404587371.0, + "step": 15978 + }, + { + "epoch": 1.7547770700636942, + "grad_norm": 2.146653652191162, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7138286232948303, + "num_tokens": 404614237.0, + "step": 15979 + }, + { + "epoch": 1.754886887766308, + "grad_norm": 2.272873640060425, + 
"learning_rate": 1e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7331454157829285, + "num_tokens": 404637232.0, + "step": 15980 + }, + { + "epoch": 1.7549967054689217, + "grad_norm": 2.5418922901153564, + "learning_rate": 1e-06, + "loss": 0.8304, + "mean_token_accuracy": 0.7414071559906006, + "num_tokens": 404658131.0, + "step": 15981 + }, + { + "epoch": 1.7551065231715353, + "grad_norm": 2.357635974884033, + "learning_rate": 1e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.7259069681167603, + "num_tokens": 404682081.0, + "step": 15982 + }, + { + "epoch": 1.7552163408741488, + "grad_norm": 2.486910104751587, + "learning_rate": 1e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7344019412994385, + "num_tokens": 404703270.0, + "step": 15983 + }, + { + "epoch": 1.7553261585767626, + "grad_norm": 2.0237536430358887, + "learning_rate": 1e-06, + "loss": 0.8418, + "mean_token_accuracy": 0.7330849170684814, + "num_tokens": 404732334.0, + "step": 15984 + }, + { + "epoch": 1.7554359762793763, + "grad_norm": 2.0858023166656494, + "learning_rate": 1e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.6965980529785156, + "num_tokens": 404764438.0, + "step": 15985 + }, + { + "epoch": 1.7555457939819898, + "grad_norm": 2.710526943206787, + "learning_rate": 1e-06, + "loss": 0.8312, + "mean_token_accuracy": 0.7376121282577515, + "num_tokens": 404784109.0, + "step": 15986 + }, + { + "epoch": 1.7556556116846036, + "grad_norm": 1.8785862922668457, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7051325440406799, + "num_tokens": 404816367.0, + "step": 15987 + }, + { + "epoch": 1.7557654293872171, + "grad_norm": 2.000838041305542, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7068638205528259, + "num_tokens": 404848700.0, + "step": 15988 + }, + { + "epoch": 1.755875247089831, + "grad_norm": 2.1954548358917236, + "learning_rate": 1e-06, + "loss": 0.8727, + "mean_token_accuracy": 0.7286679744720459, + "num_tokens": 404875302.0, + 
"step": 15989 + }, + { + "epoch": 1.7559850647924446, + "grad_norm": 2.236358404159546, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.6887966394424438, + "num_tokens": 404903364.0, + "step": 15990 + }, + { + "epoch": 1.7560948824950582, + "grad_norm": 2.5842230319976807, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7265554666519165, + "num_tokens": 404923887.0, + "step": 15991 + }, + { + "epoch": 1.7562047001976717, + "grad_norm": 2.2139062881469727, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7119116187095642, + "num_tokens": 404950716.0, + "step": 15992 + }, + { + "epoch": 1.7563145179002855, + "grad_norm": 2.334473133087158, + "learning_rate": 1e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.7285525798797607, + "num_tokens": 404973529.0, + "step": 15993 + }, + { + "epoch": 1.7564243356028992, + "grad_norm": 2.2947309017181396, + "learning_rate": 1e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7335461378097534, + "num_tokens": 404998486.0, + "step": 15994 + }, + { + "epoch": 1.756534153305513, + "grad_norm": 2.35745906829834, + "learning_rate": 1e-06, + "loss": 0.8024, + "mean_token_accuracy": 0.740326464176178, + "num_tokens": 405022449.0, + "step": 15995 + }, + { + "epoch": 1.7566439710081265, + "grad_norm": 2.19165301322937, + "learning_rate": 1e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.710899829864502, + "num_tokens": 405046900.0, + "step": 15996 + }, + { + "epoch": 1.75675378871074, + "grad_norm": 2.4674952030181885, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.727361798286438, + "num_tokens": 405068236.0, + "step": 15997 + }, + { + "epoch": 1.7568636064133538, + "grad_norm": 2.3568177223205566, + "learning_rate": 1e-06, + "loss": 0.8344, + "mean_token_accuracy": 0.7398507595062256, + "num_tokens": 405090462.0, + "step": 15998 + }, + { + "epoch": 1.7569734241159676, + "grad_norm": 2.571760892868042, + "learning_rate": 1e-06, + "loss": 
0.7617, + "mean_token_accuracy": 0.7485643625259399, + "num_tokens": 405109355.0, + "step": 15999 + }, + { + "epoch": 1.757083241818581, + "grad_norm": 2.3838136196136475, + "learning_rate": 1e-06, + "loss": 0.8543, + "mean_token_accuracy": 0.7382329702377319, + "num_tokens": 405132172.0, + "step": 16000 + }, + { + "epoch": 1.7571930595211949, + "grad_norm": 2.0597312450408936, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.6879193186759949, + "num_tokens": 405163106.0, + "step": 16001 + }, + { + "epoch": 1.7573028772238084, + "grad_norm": 2.48573637008667, + "learning_rate": 1e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.7218990325927734, + "num_tokens": 405185081.0, + "step": 16002 + }, + { + "epoch": 1.7574126949264222, + "grad_norm": 2.233947992324829, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7218471765518188, + "num_tokens": 405211656.0, + "step": 16003 + }, + { + "epoch": 1.757522512629036, + "grad_norm": 2.1177992820739746, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7087197303771973, + "num_tokens": 405239775.0, + "step": 16004 + }, + { + "epoch": 1.7576323303316495, + "grad_norm": 2.16654109954834, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7199735641479492, + "num_tokens": 405268043.0, + "step": 16005 + }, + { + "epoch": 1.757742148034263, + "grad_norm": 2.212461233139038, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7227466702461243, + "num_tokens": 405294343.0, + "step": 16006 + }, + { + "epoch": 1.7578519657368767, + "grad_norm": 2.5502922534942627, + "learning_rate": 1e-06, + "loss": 0.8732, + "mean_token_accuracy": 0.7240673899650574, + "num_tokens": 405317246.0, + "step": 16007 + }, + { + "epoch": 1.7579617834394905, + "grad_norm": 2.310340642929077, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7133191823959351, + "num_tokens": 405340977.0, + "step": 16008 + }, + { + "epoch": 
1.7580716011421043, + "grad_norm": 2.3536643981933594, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7034719586372375, + "num_tokens": 405365936.0, + "step": 16009 + }, + { + "epoch": 1.7581814188447178, + "grad_norm": 2.501314640045166, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7130337357521057, + "num_tokens": 405387548.0, + "step": 16010 + }, + { + "epoch": 1.7582912365473313, + "grad_norm": 2.573540687561035, + "learning_rate": 1e-06, + "loss": 0.8485, + "mean_token_accuracy": 0.7311628460884094, + "num_tokens": 405407562.0, + "step": 16011 + }, + { + "epoch": 1.758401054249945, + "grad_norm": 2.1114840507507324, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.6963804960250854, + "num_tokens": 405437715.0, + "step": 16012 + }, + { + "epoch": 1.7585108719525588, + "grad_norm": 2.4065191745758057, + "learning_rate": 1e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.7286455631256104, + "num_tokens": 405459845.0, + "step": 16013 + }, + { + "epoch": 1.7586206896551724, + "grad_norm": 2.0688083171844482, + "learning_rate": 1e-06, + "loss": 0.8182, + "mean_token_accuracy": 0.7461963295936584, + "num_tokens": 405487965.0, + "step": 16014 + }, + { + "epoch": 1.758730507357786, + "grad_norm": 2.0544514656066895, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7096424698829651, + "num_tokens": 405516995.0, + "step": 16015 + }, + { + "epoch": 1.7588403250603997, + "grad_norm": 2.1519882678985596, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7066829204559326, + "num_tokens": 405544267.0, + "step": 16016 + }, + { + "epoch": 1.7589501427630134, + "grad_norm": 2.0336246490478516, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7139927744865417, + "num_tokens": 405575215.0, + "step": 16017 + }, + { + "epoch": 1.7590599604656272, + "grad_norm": 2.151611566543579, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 
0.6991534233093262, + "num_tokens": 405603543.0, + "step": 16018 + }, + { + "epoch": 1.7591697781682407, + "grad_norm": 2.615373373031616, + "learning_rate": 1e-06, + "loss": 0.8264, + "mean_token_accuracy": 0.7385786771774292, + "num_tokens": 405623008.0, + "step": 16019 + }, + { + "epoch": 1.7592795958708543, + "grad_norm": 2.543240547180176, + "learning_rate": 1e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.7234779000282288, + "num_tokens": 405643825.0, + "step": 16020 + }, + { + "epoch": 1.759389413573468, + "grad_norm": 2.5477304458618164, + "learning_rate": 1e-06, + "loss": 0.807, + "mean_token_accuracy": 0.7407010793685913, + "num_tokens": 405662642.0, + "step": 16021 + }, + { + "epoch": 1.7594992312760818, + "grad_norm": 2.2398524284362793, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7117022275924683, + "num_tokens": 405686414.0, + "step": 16022 + }, + { + "epoch": 1.7596090489786955, + "grad_norm": 2.086928606033325, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7135871648788452, + "num_tokens": 405716565.0, + "step": 16023 + }, + { + "epoch": 1.759718866681309, + "grad_norm": 2.116785764694214, + "learning_rate": 1e-06, + "loss": 0.8035, + "mean_token_accuracy": 0.7443277835845947, + "num_tokens": 405742342.0, + "step": 16024 + }, + { + "epoch": 1.7598286843839226, + "grad_norm": 2.2591381072998047, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7035700082778931, + "num_tokens": 405770154.0, + "step": 16025 + }, + { + "epoch": 1.7599385020865363, + "grad_norm": 2.1883528232574463, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.710237979888916, + "num_tokens": 405796415.0, + "step": 16026 + }, + { + "epoch": 1.76004831978915, + "grad_norm": 2.6877036094665527, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7176527976989746, + "num_tokens": 405814779.0, + "step": 16027 + }, + { + "epoch": 1.7601581374917636, + "grad_norm": 
1.889275074005127, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.6976172924041748, + "num_tokens": 405849593.0, + "step": 16028 + }, + { + "epoch": 1.7602679551943772, + "grad_norm": 2.1059908866882324, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7168068885803223, + "num_tokens": 405880096.0, + "step": 16029 + }, + { + "epoch": 1.760377772896991, + "grad_norm": 2.228764533996582, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7279943227767944, + "num_tokens": 405906012.0, + "step": 16030 + }, + { + "epoch": 1.7604875905996047, + "grad_norm": 2.025585889816284, + "learning_rate": 1e-06, + "loss": 0.8626, + "mean_token_accuracy": 0.7275859117507935, + "num_tokens": 405935432.0, + "step": 16031 + }, + { + "epoch": 1.7605974083022184, + "grad_norm": 2.242323160171509, + "learning_rate": 1e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.7362488508224487, + "num_tokens": 405960729.0, + "step": 16032 + }, + { + "epoch": 1.760707226004832, + "grad_norm": 2.24588680267334, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7241171598434448, + "num_tokens": 405986396.0, + "step": 16033 + }, + { + "epoch": 1.7608170437074455, + "grad_norm": 2.1476476192474365, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7153260707855225, + "num_tokens": 406015140.0, + "step": 16034 + }, + { + "epoch": 1.7609268614100593, + "grad_norm": 2.081753969192505, + "learning_rate": 1e-06, + "loss": 0.8828, + "mean_token_accuracy": 0.7384620308876038, + "num_tokens": 406043170.0, + "step": 16035 + }, + { + "epoch": 1.761036679112673, + "grad_norm": 2.2530996799468994, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7206137776374817, + "num_tokens": 406069136.0, + "step": 16036 + }, + { + "epoch": 1.7611464968152868, + "grad_norm": 2.1424906253814697, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7022472620010376, + "num_tokens": 
406098944.0, + "step": 16037 + }, + { + "epoch": 1.7612563145179003, + "grad_norm": 2.225639820098877, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7176341414451599, + "num_tokens": 406126758.0, + "step": 16038 + }, + { + "epoch": 1.7613661322205139, + "grad_norm": 2.1349451541900635, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7088024616241455, + "num_tokens": 406153815.0, + "step": 16039 + }, + { + "epoch": 1.7614759499231276, + "grad_norm": 2.3630266189575195, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7007281184196472, + "num_tokens": 406176608.0, + "step": 16040 + }, + { + "epoch": 1.7615857676257414, + "grad_norm": 2.1225013732910156, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7219235301017761, + "num_tokens": 406203509.0, + "step": 16041 + }, + { + "epoch": 1.761695585328355, + "grad_norm": 2.2195847034454346, + "learning_rate": 1e-06, + "loss": 0.8564, + "mean_token_accuracy": 0.7338899374008179, + "num_tokens": 406227199.0, + "step": 16042 + }, + { + "epoch": 1.7618054030309684, + "grad_norm": 2.3030354976654053, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7171098589897156, + "num_tokens": 406253688.0, + "step": 16043 + }, + { + "epoch": 1.7619152207335822, + "grad_norm": 2.1051974296569824, + "learning_rate": 1e-06, + "loss": 0.869, + "mean_token_accuracy": 0.7331718802452087, + "num_tokens": 406282045.0, + "step": 16044 + }, + { + "epoch": 1.762025038436196, + "grad_norm": 2.2179903984069824, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7054475545883179, + "num_tokens": 406308501.0, + "step": 16045 + }, + { + "epoch": 1.7621348561388097, + "grad_norm": 2.481234312057495, + "learning_rate": 1e-06, + "loss": 0.8153, + "mean_token_accuracy": 0.7382287979125977, + "num_tokens": 406329988.0, + "step": 16046 + }, + { + "epoch": 1.7622446738414232, + "grad_norm": 2.2593770027160645, + "learning_rate": 
1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.718032956123352, + "num_tokens": 406354782.0, + "step": 16047 + }, + { + "epoch": 1.7623544915440368, + "grad_norm": 2.3620803356170654, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7019901871681213, + "num_tokens": 406376098.0, + "step": 16048 + }, + { + "epoch": 1.7624643092466505, + "grad_norm": 2.625453472137451, + "learning_rate": 1e-06, + "loss": 0.8423, + "mean_token_accuracy": 0.7289296388626099, + "num_tokens": 406396371.0, + "step": 16049 + }, + { + "epoch": 1.7625741269492643, + "grad_norm": 2.8465774059295654, + "learning_rate": 1e-06, + "loss": 0.8108, + "mean_token_accuracy": 0.7373226881027222, + "num_tokens": 406413782.0, + "step": 16050 + }, + { + "epoch": 1.7626839446518778, + "grad_norm": 2.354451894760132, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7224582433700562, + "num_tokens": 406435942.0, + "step": 16051 + }, + { + "epoch": 1.7627937623544916, + "grad_norm": 2.2507054805755615, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7236078977584839, + "num_tokens": 406459495.0, + "step": 16052 + }, + { + "epoch": 1.7629035800571051, + "grad_norm": 1.9039132595062256, + "learning_rate": 1e-06, + "loss": 0.8212, + "mean_token_accuracy": 0.7475709319114685, + "num_tokens": 406488483.0, + "step": 16053 + }, + { + "epoch": 1.7630133977597189, + "grad_norm": 2.523488759994507, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7029551267623901, + "num_tokens": 406510669.0, + "step": 16054 + }, + { + "epoch": 1.7631232154623326, + "grad_norm": 2.2984557151794434, + "learning_rate": 1e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7193782925605774, + "num_tokens": 406533833.0, + "step": 16055 + }, + { + "epoch": 1.7632330331649462, + "grad_norm": 2.33648681640625, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7061460018157959, + "num_tokens": 406561312.0, + "step": 16056 + }, + { + 
"epoch": 1.7633428508675597, + "grad_norm": 2.2021896839141846, + "learning_rate": 1e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7318866848945618, + "num_tokens": 406587384.0, + "step": 16057 + }, + { + "epoch": 1.7634526685701735, + "grad_norm": 2.05961012840271, + "learning_rate": 1e-06, + "loss": 0.8279, + "mean_token_accuracy": 0.7393192648887634, + "num_tokens": 406616632.0, + "step": 16058 + }, + { + "epoch": 1.7635624862727872, + "grad_norm": 2.1289262771606445, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7175648212432861, + "num_tokens": 406643499.0, + "step": 16059 + }, + { + "epoch": 1.763672303975401, + "grad_norm": 2.212646245956421, + "learning_rate": 1e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7292624711990356, + "num_tokens": 406669020.0, + "step": 16060 + }, + { + "epoch": 1.7637821216780145, + "grad_norm": 2.3386993408203125, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7242136597633362, + "num_tokens": 406692681.0, + "step": 16061 + }, + { + "epoch": 1.763891939380628, + "grad_norm": 2.3175840377807617, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7085069417953491, + "num_tokens": 406717899.0, + "step": 16062 + }, + { + "epoch": 1.7640017570832418, + "grad_norm": 2.303241729736328, + "learning_rate": 1e-06, + "loss": 0.8615, + "mean_token_accuracy": 0.7302625179290771, + "num_tokens": 406742504.0, + "step": 16063 + }, + { + "epoch": 1.7641115747858556, + "grad_norm": 2.4474854469299316, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7142101526260376, + "num_tokens": 406763478.0, + "step": 16064 + }, + { + "epoch": 1.764221392488469, + "grad_norm": 2.2992618083953857, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7285493016242981, + "num_tokens": 406787091.0, + "step": 16065 + }, + { + "epoch": 1.7643312101910829, + "grad_norm": 2.346545696258545, + "learning_rate": 1e-06, + "loss": 0.8811, + 
"mean_token_accuracy": 0.7197005748748779, + "num_tokens": 406809985.0, + "step": 16066 + }, + { + "epoch": 1.7644410278936964, + "grad_norm": 2.444824457168579, + "learning_rate": 1e-06, + "loss": 0.846, + "mean_token_accuracy": 0.7336935997009277, + "num_tokens": 406832481.0, + "step": 16067 + }, + { + "epoch": 1.7645508455963101, + "grad_norm": 2.3919503688812256, + "learning_rate": 1e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7219932079315186, + "num_tokens": 406856007.0, + "step": 16068 + }, + { + "epoch": 1.764660663298924, + "grad_norm": 1.9704159498214722, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.6970296502113342, + "num_tokens": 406887067.0, + "step": 16069 + }, + { + "epoch": 1.7647704810015374, + "grad_norm": 2.5584073066711426, + "learning_rate": 1e-06, + "loss": 0.8174, + "mean_token_accuracy": 0.7372996211051941, + "num_tokens": 406907291.0, + "step": 16070 + }, + { + "epoch": 1.764880298704151, + "grad_norm": 2.360483407974243, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7172894477844238, + "num_tokens": 406930447.0, + "step": 16071 + }, + { + "epoch": 1.7649901164067647, + "grad_norm": 2.234212636947632, + "learning_rate": 1e-06, + "loss": 0.881, + "mean_token_accuracy": 0.7227657437324524, + "num_tokens": 406958572.0, + "step": 16072 + }, + { + "epoch": 1.7650999341093785, + "grad_norm": 2.7761645317077637, + "learning_rate": 1e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.7206238508224487, + "num_tokens": 406975495.0, + "step": 16073 + }, + { + "epoch": 1.7652097518119922, + "grad_norm": 2.3843984603881836, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7231173515319824, + "num_tokens": 406999328.0, + "step": 16074 + }, + { + "epoch": 1.7653195695146058, + "grad_norm": 2.1467182636260986, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7046197652816772, + "num_tokens": 407027359.0, + "step": 16075 + }, + { + "epoch": 
1.7654293872172193, + "grad_norm": 2.6199429035186768, + "learning_rate": 1e-06, + "loss": 0.8806, + "mean_token_accuracy": 0.7298944592475891, + "num_tokens": 407047763.0, + "step": 16076 + }, + { + "epoch": 1.765539204919833, + "grad_norm": 2.161162853240967, + "learning_rate": 1e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.7335177659988403, + "num_tokens": 407074724.0, + "step": 16077 + }, + { + "epoch": 1.7656490226224468, + "grad_norm": 2.1610474586486816, + "learning_rate": 1e-06, + "loss": 0.8313, + "mean_token_accuracy": 0.7391022443771362, + "num_tokens": 407102292.0, + "step": 16078 + }, + { + "epoch": 1.7657588403250604, + "grad_norm": 2.31636381149292, + "learning_rate": 1e-06, + "loss": 0.8044, + "mean_token_accuracy": 0.7452445030212402, + "num_tokens": 407124955.0, + "step": 16079 + }, + { + "epoch": 1.765868658027674, + "grad_norm": 2.390320301055908, + "learning_rate": 1e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.7294619083404541, + "num_tokens": 407148191.0, + "step": 16080 + }, + { + "epoch": 1.7659784757302877, + "grad_norm": 2.3128409385681152, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7061930894851685, + "num_tokens": 407173435.0, + "step": 16081 + }, + { + "epoch": 1.7660882934329014, + "grad_norm": 2.887810468673706, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7174474596977234, + "num_tokens": 407189846.0, + "step": 16082 + }, + { + "epoch": 1.7661981111355152, + "grad_norm": 2.1649773120880127, + "learning_rate": 1e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7104604840278625, + "num_tokens": 407216249.0, + "step": 16083 + }, + { + "epoch": 1.7663079288381287, + "grad_norm": 2.404866933822632, + "learning_rate": 1e-06, + "loss": 0.8465, + "mean_token_accuracy": 0.7346601486206055, + "num_tokens": 407237738.0, + "step": 16084 + }, + { + "epoch": 1.7664177465407422, + "grad_norm": 2.2434232234954834, + "learning_rate": 1e-06, + "loss": 0.8727, + "mean_token_accuracy": 
0.7326566576957703, + "num_tokens": 407261999.0, + "step": 16085 + }, + { + "epoch": 1.766527564243356, + "grad_norm": 2.0427346229553223, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7003654837608337, + "num_tokens": 407293444.0, + "step": 16086 + }, + { + "epoch": 1.7666373819459698, + "grad_norm": 2.282602548599243, + "learning_rate": 1e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.71730637550354, + "num_tokens": 407318732.0, + "step": 16087 + }, + { + "epoch": 1.7667471996485835, + "grad_norm": 2.021599054336548, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.6899511218070984, + "num_tokens": 407350103.0, + "step": 16088 + }, + { + "epoch": 1.766857017351197, + "grad_norm": 2.156418800354004, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7179080247879028, + "num_tokens": 407375348.0, + "step": 16089 + }, + { + "epoch": 1.7669668350538106, + "grad_norm": 2.072579860687256, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7271432280540466, + "num_tokens": 407403634.0, + "step": 16090 + }, + { + "epoch": 1.7670766527564243, + "grad_norm": 2.2805309295654297, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.714950680732727, + "num_tokens": 407428394.0, + "step": 16091 + }, + { + "epoch": 1.767186470459038, + "grad_norm": 2.1405904293060303, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7132925987243652, + "num_tokens": 407455408.0, + "step": 16092 + }, + { + "epoch": 1.7672962881616516, + "grad_norm": 2.1376149654388428, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7127372026443481, + "num_tokens": 407483063.0, + "step": 16093 + }, + { + "epoch": 1.7674061058642652, + "grad_norm": 2.157644748687744, + "learning_rate": 1e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7276451587677002, + "num_tokens": 407509052.0, + "step": 16094 + }, + { + "epoch": 1.767515923566879, + "grad_norm": 
2.3085193634033203, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7237791419029236, + "num_tokens": 407533567.0, + "step": 16095 + }, + { + "epoch": 1.7676257412694927, + "grad_norm": 2.254845380783081, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7141842842102051, + "num_tokens": 407558090.0, + "step": 16096 + }, + { + "epoch": 1.7677355589721064, + "grad_norm": 2.0532801151275635, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7164246439933777, + "num_tokens": 407586707.0, + "step": 16097 + }, + { + "epoch": 1.76784537667472, + "grad_norm": 2.066293478012085, + "learning_rate": 1e-06, + "loss": 0.8514, + "mean_token_accuracy": 0.7302322387695312, + "num_tokens": 407614003.0, + "step": 16098 + }, + { + "epoch": 1.7679551943773335, + "grad_norm": 2.2658143043518066, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7097790241241455, + "num_tokens": 407638674.0, + "step": 16099 + }, + { + "epoch": 1.7680650120799473, + "grad_norm": 2.2635600566864014, + "learning_rate": 1e-06, + "loss": 0.8407, + "mean_token_accuracy": 0.733478844165802, + "num_tokens": 407664692.0, + "step": 16100 + }, + { + "epoch": 1.768174829782561, + "grad_norm": 2.4666006565093994, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7204228639602661, + "num_tokens": 407685692.0, + "step": 16101 + }, + { + "epoch": 1.7682846474851748, + "grad_norm": 2.0656492710113525, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7227101922035217, + "num_tokens": 407714127.0, + "step": 16102 + }, + { + "epoch": 1.7683944651877883, + "grad_norm": 2.071695566177368, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7052687406539917, + "num_tokens": 407743872.0, + "step": 16103 + }, + { + "epoch": 1.7685042828904018, + "grad_norm": 2.312110662460327, + "learning_rate": 1e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.7288243770599365, + "num_tokens": 
407768208.0, + "step": 16104 + }, + { + "epoch": 1.7686141005930156, + "grad_norm": 2.340322256088257, + "learning_rate": 1e-06, + "loss": 0.8697, + "mean_token_accuracy": 0.7284529209136963, + "num_tokens": 407792219.0, + "step": 16105 + }, + { + "epoch": 1.7687239182956294, + "grad_norm": 2.251673698425293, + "learning_rate": 1e-06, + "loss": 0.8691, + "mean_token_accuracy": 0.7254594564437866, + "num_tokens": 407817799.0, + "step": 16106 + }, + { + "epoch": 1.768833735998243, + "grad_norm": 2.3604815006256104, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.6948732137680054, + "num_tokens": 407842612.0, + "step": 16107 + }, + { + "epoch": 1.7689435537008564, + "grad_norm": 2.22268009185791, + "learning_rate": 1e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7273872494697571, + "num_tokens": 407868091.0, + "step": 16108 + }, + { + "epoch": 1.7690533714034702, + "grad_norm": 2.687735080718994, + "learning_rate": 1e-06, + "loss": 0.8505, + "mean_token_accuracy": 0.7318414449691772, + "num_tokens": 407887081.0, + "step": 16109 + }, + { + "epoch": 1.769163189106084, + "grad_norm": 2.1809933185577393, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.6999625563621521, + "num_tokens": 407913831.0, + "step": 16110 + }, + { + "epoch": 1.7692730068086977, + "grad_norm": 2.200617551803589, + "learning_rate": 1e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7156060934066772, + "num_tokens": 407940148.0, + "step": 16111 + }, + { + "epoch": 1.7693828245113112, + "grad_norm": 2.057201385498047, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7239285707473755, + "num_tokens": 407966878.0, + "step": 16112 + }, + { + "epoch": 1.7694926422139248, + "grad_norm": 2.446986436843872, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7109346389770508, + "num_tokens": 407988893.0, + "step": 16113 + }, + { + "epoch": 1.7696024599165385, + "grad_norm": 2.43740177154541, + "learning_rate": 1e-06, + 
"loss": 0.9474, + "mean_token_accuracy": 0.7070571184158325, + "num_tokens": 408010842.0, + "step": 16114 + }, + { + "epoch": 1.7697122776191523, + "grad_norm": 2.270477771759033, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7100957632064819, + "num_tokens": 408037139.0, + "step": 16115 + }, + { + "epoch": 1.7698220953217658, + "grad_norm": 2.5111794471740723, + "learning_rate": 1e-06, + "loss": 0.845, + "mean_token_accuracy": 0.740587592124939, + "num_tokens": 408058093.0, + "step": 16116 + }, + { + "epoch": 1.7699319130243796, + "grad_norm": 2.407996416091919, + "learning_rate": 1e-06, + "loss": 0.8412, + "mean_token_accuracy": 0.7305554747581482, + "num_tokens": 408080037.0, + "step": 16117 + }, + { + "epoch": 1.7700417307269931, + "grad_norm": 2.2517549991607666, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7151980996131897, + "num_tokens": 408104407.0, + "step": 16118 + }, + { + "epoch": 1.7701515484296069, + "grad_norm": 2.183382987976074, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7177777290344238, + "num_tokens": 408131489.0, + "step": 16119 + }, + { + "epoch": 1.7702613661322206, + "grad_norm": 2.145209550857544, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7137544751167297, + "num_tokens": 408159571.0, + "step": 16120 + }, + { + "epoch": 1.7703711838348342, + "grad_norm": 2.404712677001953, + "learning_rate": 1e-06, + "loss": 0.8786, + "mean_token_accuracy": 0.7227106094360352, + "num_tokens": 408182208.0, + "step": 16121 + }, + { + "epoch": 1.7704810015374477, + "grad_norm": 2.269922971725464, + "learning_rate": 1e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.7418913841247559, + "num_tokens": 408207076.0, + "step": 16122 + }, + { + "epoch": 1.7705908192400615, + "grad_norm": 2.6594367027282715, + "learning_rate": 1e-06, + "loss": 0.787, + "mean_token_accuracy": 0.7511887550354004, + "num_tokens": 408226242.0, + "step": 16123 + }, + { + "epoch": 
1.7707006369426752, + "grad_norm": 2.4899046421051025, + "learning_rate": 1e-06, + "loss": 0.8509, + "mean_token_accuracy": 0.7320536375045776, + "num_tokens": 408246736.0, + "step": 16124 + }, + { + "epoch": 1.770810454645289, + "grad_norm": 2.165243148803711, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7204164266586304, + "num_tokens": 408275409.0, + "step": 16125 + }, + { + "epoch": 1.7709202723479025, + "grad_norm": 1.9013315439224243, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7175395488739014, + "num_tokens": 408309892.0, + "step": 16126 + }, + { + "epoch": 1.771030090050516, + "grad_norm": 2.061049699783325, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7156108617782593, + "num_tokens": 408339033.0, + "step": 16127 + }, + { + "epoch": 1.7711399077531298, + "grad_norm": 2.0099432468414307, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.6952439546585083, + "num_tokens": 408369237.0, + "step": 16128 + }, + { + "epoch": 1.7712497254557436, + "grad_norm": 2.3042843341827393, + "learning_rate": 1e-06, + "loss": 0.8192, + "mean_token_accuracy": 0.7373200058937073, + "num_tokens": 408392900.0, + "step": 16129 + }, + { + "epoch": 1.771359543158357, + "grad_norm": 2.0726709365844727, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7180675268173218, + "num_tokens": 408422739.0, + "step": 16130 + }, + { + "epoch": 1.7714693608609708, + "grad_norm": 2.274894952774048, + "learning_rate": 1e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7295697927474976, + "num_tokens": 408447105.0, + "step": 16131 + }, + { + "epoch": 1.7715791785635844, + "grad_norm": 2.2399330139160156, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7116045951843262, + "num_tokens": 408473378.0, + "step": 16132 + }, + { + "epoch": 1.7716889962661981, + "grad_norm": 2.4483180046081543, + "learning_rate": 1e-06, + "loss": 0.8665, + "mean_token_accuracy": 
0.731928288936615, + "num_tokens": 408493943.0, + "step": 16133 + }, + { + "epoch": 1.771798813968812, + "grad_norm": 2.176304578781128, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7239546775817871, + "num_tokens": 408520995.0, + "step": 16134 + }, + { + "epoch": 1.7719086316714254, + "grad_norm": 2.315518617630005, + "learning_rate": 1e-06, + "loss": 0.8189, + "mean_token_accuracy": 0.7392233610153198, + "num_tokens": 408546162.0, + "step": 16135 + }, + { + "epoch": 1.772018449374039, + "grad_norm": 2.3777966499328613, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7208165526390076, + "num_tokens": 408568798.0, + "step": 16136 + }, + { + "epoch": 1.7721282670766527, + "grad_norm": 2.583155632019043, + "learning_rate": 1e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.7295646667480469, + "num_tokens": 408589359.0, + "step": 16137 + }, + { + "epoch": 1.7722380847792665, + "grad_norm": 2.220879316329956, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7140326499938965, + "num_tokens": 408616036.0, + "step": 16138 + }, + { + "epoch": 1.7723479024818802, + "grad_norm": 2.443439245223999, + "learning_rate": 1e-06, + "loss": 0.8069, + "mean_token_accuracy": 0.7432256937026978, + "num_tokens": 408638796.0, + "step": 16139 + }, + { + "epoch": 1.7724577201844938, + "grad_norm": 2.3432626724243164, + "learning_rate": 1e-06, + "loss": 0.8317, + "mean_token_accuracy": 0.735055685043335, + "num_tokens": 408661981.0, + "step": 16140 + }, + { + "epoch": 1.7725675378871073, + "grad_norm": 2.251797914505005, + "learning_rate": 1e-06, + "loss": 0.8436, + "mean_token_accuracy": 0.7354350090026855, + "num_tokens": 408685945.0, + "step": 16141 + }, + { + "epoch": 1.772677355589721, + "grad_norm": 2.6933753490448, + "learning_rate": 1e-06, + "loss": 0.8179, + "mean_token_accuracy": 0.7368184924125671, + "num_tokens": 408703283.0, + "step": 16142 + }, + { + "epoch": 1.7727871732923348, + "grad_norm": 
2.0675675868988037, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7084953784942627, + "num_tokens": 408731031.0, + "step": 16143 + }, + { + "epoch": 1.7728969909949484, + "grad_norm": 2.3428220748901367, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7169216871261597, + "num_tokens": 408754401.0, + "step": 16144 + }, + { + "epoch": 1.7730068086975619, + "grad_norm": 2.106273889541626, + "learning_rate": 1e-06, + "loss": 0.8513, + "mean_token_accuracy": 0.730894923210144, + "num_tokens": 408782107.0, + "step": 16145 + }, + { + "epoch": 1.7731166264001756, + "grad_norm": 2.4082348346710205, + "learning_rate": 1e-06, + "loss": 0.8435, + "mean_token_accuracy": 0.7433854341506958, + "num_tokens": 408804477.0, + "step": 16146 + }, + { + "epoch": 1.7732264441027894, + "grad_norm": 2.2776551246643066, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7205995917320251, + "num_tokens": 408829883.0, + "step": 16147 + }, + { + "epoch": 1.7733362618054032, + "grad_norm": 1.9952133893966675, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.7000674605369568, + "num_tokens": 408861780.0, + "step": 16148 + }, + { + "epoch": 1.7734460795080167, + "grad_norm": 1.9689215421676636, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7085680365562439, + "num_tokens": 408895384.0, + "step": 16149 + }, + { + "epoch": 1.7735558972106302, + "grad_norm": 2.591592788696289, + "learning_rate": 1e-06, + "loss": 0.8468, + "mean_token_accuracy": 0.7341139316558838, + "num_tokens": 408915765.0, + "step": 16150 + }, + { + "epoch": 1.773665714913244, + "grad_norm": 2.0704681873321533, + "learning_rate": 1e-06, + "loss": 0.8511, + "mean_token_accuracy": 0.7329773902893066, + "num_tokens": 408944680.0, + "step": 16151 + }, + { + "epoch": 1.7737755326158577, + "grad_norm": 2.0428483486175537, + "learning_rate": 1e-06, + "loss": 0.8429, + "mean_token_accuracy": 0.7342293858528137, + 
"num_tokens": 408972965.0, + "step": 16152 + }, + { + "epoch": 1.7738853503184715, + "grad_norm": 2.4272379875183105, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7124184966087341, + "num_tokens": 408996986.0, + "step": 16153 + }, + { + "epoch": 1.773995168021085, + "grad_norm": 2.2923805713653564, + "learning_rate": 1e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.7252335548400879, + "num_tokens": 409020333.0, + "step": 16154 + }, + { + "epoch": 1.7741049857236986, + "grad_norm": 2.0295333862304688, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.6876941919326782, + "num_tokens": 409053387.0, + "step": 16155 + }, + { + "epoch": 1.7742148034263123, + "grad_norm": 2.2965164184570312, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7058712244033813, + "num_tokens": 409078728.0, + "step": 16156 + }, + { + "epoch": 1.774324621128926, + "grad_norm": 2.2493600845336914, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7092969417572021, + "num_tokens": 409102887.0, + "step": 16157 + }, + { + "epoch": 1.7744344388315396, + "grad_norm": 2.0099987983703613, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7229884266853333, + "num_tokens": 409134094.0, + "step": 16158 + }, + { + "epoch": 1.7745442565341532, + "grad_norm": 2.0465481281280518, + "learning_rate": 1e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.7250944375991821, + "num_tokens": 409165373.0, + "step": 16159 + }, + { + "epoch": 1.774654074236767, + "grad_norm": 2.3084917068481445, + "learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.7315105199813843, + "num_tokens": 409189557.0, + "step": 16160 + }, + { + "epoch": 1.7747638919393807, + "grad_norm": 1.952951431274414, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.717157244682312, + "num_tokens": 409223035.0, + "step": 16161 + }, + { + "epoch": 1.7748737096419944, + "grad_norm": 2.5919625759124756, + 
"learning_rate": 1e-06, + "loss": 0.8007, + "mean_token_accuracy": 0.7465630769729614, + "num_tokens": 409241939.0, + "step": 16162 + }, + { + "epoch": 1.774983527344608, + "grad_norm": 2.109457015991211, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7189408540725708, + "num_tokens": 409270666.0, + "step": 16163 + }, + { + "epoch": 1.7750933450472215, + "grad_norm": 2.267263650894165, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7319226264953613, + "num_tokens": 409295558.0, + "step": 16164 + }, + { + "epoch": 1.7752031627498353, + "grad_norm": 1.7671852111816406, + "learning_rate": 1e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.7123757600784302, + "num_tokens": 409331410.0, + "step": 16165 + }, + { + "epoch": 1.775312980452449, + "grad_norm": 2.386082649230957, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7095391154289246, + "num_tokens": 409356347.0, + "step": 16166 + }, + { + "epoch": 1.7754227981550625, + "grad_norm": 2.553257703781128, + "learning_rate": 1e-06, + "loss": 0.861, + "mean_token_accuracy": 0.7342643141746521, + "num_tokens": 409376335.0, + "step": 16167 + }, + { + "epoch": 1.7755326158576763, + "grad_norm": 2.4222402572631836, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.708949089050293, + "num_tokens": 409401219.0, + "step": 16168 + }, + { + "epoch": 1.7756424335602898, + "grad_norm": 2.263760566711426, + "learning_rate": 1e-06, + "loss": 0.8456, + "mean_token_accuracy": 0.746381402015686, + "num_tokens": 409424913.0, + "step": 16169 + }, + { + "epoch": 1.7757522512629036, + "grad_norm": 2.279782295227051, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.72209632396698, + "num_tokens": 409449336.0, + "step": 16170 + }, + { + "epoch": 1.7758620689655173, + "grad_norm": 2.142252206802368, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7019776105880737, + "num_tokens": 409479048.0, + "step": 16171 
+ }, + { + "epoch": 1.7759718866681309, + "grad_norm": 2.048725128173828, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7251577377319336, + "num_tokens": 409507696.0, + "step": 16172 + }, + { + "epoch": 1.7760817043707444, + "grad_norm": 2.348294973373413, + "learning_rate": 1e-06, + "loss": 0.8617, + "mean_token_accuracy": 0.7302171587944031, + "num_tokens": 409529873.0, + "step": 16173 + }, + { + "epoch": 1.7761915220733582, + "grad_norm": 2.144582748413086, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7024344205856323, + "num_tokens": 409557019.0, + "step": 16174 + }, + { + "epoch": 1.776301339775972, + "grad_norm": 2.4717001914978027, + "learning_rate": 1e-06, + "loss": 0.8263, + "mean_token_accuracy": 0.7401686906814575, + "num_tokens": 409578418.0, + "step": 16175 + }, + { + "epoch": 1.7764111574785857, + "grad_norm": 2.27524995803833, + "learning_rate": 1e-06, + "loss": 0.8437, + "mean_token_accuracy": 0.7302474975585938, + "num_tokens": 409602736.0, + "step": 16176 + }, + { + "epoch": 1.7765209751811992, + "grad_norm": 2.2056305408477783, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.701923131942749, + "num_tokens": 409630374.0, + "step": 16177 + }, + { + "epoch": 1.7766307928838128, + "grad_norm": 2.3257083892822266, + "learning_rate": 1e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7235255837440491, + "num_tokens": 409653686.0, + "step": 16178 + }, + { + "epoch": 1.7767406105864265, + "grad_norm": 2.209738254547119, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.6969645023345947, + "num_tokens": 409681088.0, + "step": 16179 + }, + { + "epoch": 1.7768504282890403, + "grad_norm": 2.200040340423584, + "learning_rate": 1e-06, + "loss": 0.8325, + "mean_token_accuracy": 0.7367807030677795, + "num_tokens": 409706832.0, + "step": 16180 + }, + { + "epoch": 1.7769602459916538, + "grad_norm": 2.2702994346618652, + "learning_rate": 1e-06, + "loss": 0.912, + 
"mean_token_accuracy": 0.7154710292816162, + "num_tokens": 409732007.0, + "step": 16181 + }, + { + "epoch": 1.7770700636942676, + "grad_norm": 2.190061569213867, + "learning_rate": 1e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7242346405982971, + "num_tokens": 409758331.0, + "step": 16182 + }, + { + "epoch": 1.777179881396881, + "grad_norm": 2.073230504989624, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7110397815704346, + "num_tokens": 409786312.0, + "step": 16183 + }, + { + "epoch": 1.7772896990994949, + "grad_norm": 2.220283269882202, + "learning_rate": 1e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7169623374938965, + "num_tokens": 409811050.0, + "step": 16184 + }, + { + "epoch": 1.7773995168021086, + "grad_norm": 1.8981112241744995, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7024197578430176, + "num_tokens": 409846806.0, + "step": 16185 + }, + { + "epoch": 1.7775093345047221, + "grad_norm": 2.0194649696350098, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.7090931534767151, + "num_tokens": 409876250.0, + "step": 16186 + }, + { + "epoch": 1.7776191522073357, + "grad_norm": 2.063436985015869, + "learning_rate": 1e-06, + "loss": 0.8253, + "mean_token_accuracy": 0.7318499088287354, + "num_tokens": 409903482.0, + "step": 16187 + }, + { + "epoch": 1.7777289699099494, + "grad_norm": 2.720846652984619, + "learning_rate": 1e-06, + "loss": 0.8088, + "mean_token_accuracy": 0.7447065711021423, + "num_tokens": 409920649.0, + "step": 16188 + }, + { + "epoch": 1.7778387876125632, + "grad_norm": 2.46482253074646, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7239432334899902, + "num_tokens": 409941070.0, + "step": 16189 + }, + { + "epoch": 1.777948605315177, + "grad_norm": 2.361281394958496, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7221229076385498, + "num_tokens": 409963400.0, + "step": 16190 + }, + { + "epoch": 1.7780584230177905, + 
"grad_norm": 2.2120635509490967, + "learning_rate": 1e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7349938750267029, + "num_tokens": 409992096.0, + "step": 16191 + }, + { + "epoch": 1.778168240720404, + "grad_norm": 2.4414608478546143, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7267633080482483, + "num_tokens": 410014563.0, + "step": 16192 + }, + { + "epoch": 1.7782780584230178, + "grad_norm": 2.189429759979248, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.6956824064254761, + "num_tokens": 410043166.0, + "step": 16193 + }, + { + "epoch": 1.7783878761256315, + "grad_norm": 2.2194416522979736, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7103134393692017, + "num_tokens": 410068468.0, + "step": 16194 + }, + { + "epoch": 1.778497693828245, + "grad_norm": 2.1509454250335693, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7056969404220581, + "num_tokens": 410094914.0, + "step": 16195 + }, + { + "epoch": 1.7786075115308586, + "grad_norm": 2.1228861808776855, + "learning_rate": 1e-06, + "loss": 0.8231, + "mean_token_accuracy": 0.7363564968109131, + "num_tokens": 410120426.0, + "step": 16196 + }, + { + "epoch": 1.7787173292334724, + "grad_norm": 2.211483955383301, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7002558708190918, + "num_tokens": 410148227.0, + "step": 16197 + }, + { + "epoch": 1.7788271469360861, + "grad_norm": 2.3252012729644775, + "learning_rate": 1e-06, + "loss": 0.8614, + "mean_token_accuracy": 0.7262872457504272, + "num_tokens": 410172071.0, + "step": 16198 + }, + { + "epoch": 1.7789369646386999, + "grad_norm": 2.1703639030456543, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.6999656558036804, + "num_tokens": 410200606.0, + "step": 16199 + }, + { + "epoch": 1.7790467823413134, + "grad_norm": 2.2533066272735596, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.709641695022583, + 
"num_tokens": 410224828.0, + "step": 16200 + }, + { + "epoch": 1.779156600043927, + "grad_norm": 2.600605010986328, + "learning_rate": 1e-06, + "loss": 0.8208, + "mean_token_accuracy": 0.7380220890045166, + "num_tokens": 410243426.0, + "step": 16201 + }, + { + "epoch": 1.7792664177465407, + "grad_norm": 2.45996356010437, + "learning_rate": 1e-06, + "loss": 0.8299, + "mean_token_accuracy": 0.7342922687530518, + "num_tokens": 410264635.0, + "step": 16202 + }, + { + "epoch": 1.7793762354491545, + "grad_norm": 2.2728054523468018, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.714340090751648, + "num_tokens": 410290394.0, + "step": 16203 + }, + { + "epoch": 1.7794860531517682, + "grad_norm": 2.206270694732666, + "learning_rate": 1e-06, + "loss": 0.8806, + "mean_token_accuracy": 0.7215409874916077, + "num_tokens": 410315016.0, + "step": 16204 + }, + { + "epoch": 1.7795958708543818, + "grad_norm": 2.005209445953369, + "learning_rate": 1e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7264933586120605, + "num_tokens": 410346081.0, + "step": 16205 + }, + { + "epoch": 1.7797056885569953, + "grad_norm": 2.2057864665985107, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7288199067115784, + "num_tokens": 410371411.0, + "step": 16206 + }, + { + "epoch": 1.779815506259609, + "grad_norm": 2.0833680629730225, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.707931637763977, + "num_tokens": 410401162.0, + "step": 16207 + }, + { + "epoch": 1.7799253239622228, + "grad_norm": 2.493351459503174, + "learning_rate": 1e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7187274694442749, + "num_tokens": 410423612.0, + "step": 16208 + }, + { + "epoch": 1.7800351416648363, + "grad_norm": 2.383880376815796, + "learning_rate": 1e-06, + "loss": 0.7817, + "mean_token_accuracy": 0.7574681639671326, + "num_tokens": 410446279.0, + "step": 16209 + }, + { + "epoch": 1.7801449593674499, + "grad_norm": 2.3394274711608887, + 
"learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7164023518562317, + "num_tokens": 410470543.0, + "step": 16210 + }, + { + "epoch": 1.7802547770700636, + "grad_norm": 2.3141531944274902, + "learning_rate": 1e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7240102291107178, + "num_tokens": 410497199.0, + "step": 16211 + }, + { + "epoch": 1.7803645947726774, + "grad_norm": 2.3835647106170654, + "learning_rate": 1e-06, + "loss": 0.7955, + "mean_token_accuracy": 0.752905011177063, + "num_tokens": 410518554.0, + "step": 16212 + }, + { + "epoch": 1.7804744124752911, + "grad_norm": 2.251131057739258, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7174333333969116, + "num_tokens": 410542487.0, + "step": 16213 + }, + { + "epoch": 1.7805842301779047, + "grad_norm": 2.0412909984588623, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7027727365493774, + "num_tokens": 410574396.0, + "step": 16214 + }, + { + "epoch": 1.7806940478805182, + "grad_norm": 2.357679843902588, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7140659689903259, + "num_tokens": 410599195.0, + "step": 16215 + }, + { + "epoch": 1.780803865583132, + "grad_norm": 2.8135454654693604, + "learning_rate": 1e-06, + "loss": 0.8244, + "mean_token_accuracy": 0.7370097637176514, + "num_tokens": 410617196.0, + "step": 16216 + }, + { + "epoch": 1.7809136832857457, + "grad_norm": 2.3802883625030518, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7173383831977844, + "num_tokens": 410639720.0, + "step": 16217 + }, + { + "epoch": 1.7810235009883595, + "grad_norm": 2.253528594970703, + "learning_rate": 1e-06, + "loss": 0.8585, + "mean_token_accuracy": 0.7314050197601318, + "num_tokens": 410663923.0, + "step": 16218 + }, + { + "epoch": 1.781133318690973, + "grad_norm": 2.12070631980896, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7153871059417725, + "num_tokens": 410693436.0, + "step": 
16219 + }, + { + "epoch": 1.7812431363935866, + "grad_norm": 2.1014060974121094, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7021006941795349, + "num_tokens": 410723713.0, + "step": 16220 + }, + { + "epoch": 1.7813529540962003, + "grad_norm": 2.4730799198150635, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7138198614120483, + "num_tokens": 410745121.0, + "step": 16221 + }, + { + "epoch": 1.781462771798814, + "grad_norm": 2.077162504196167, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.6898444890975952, + "num_tokens": 410774374.0, + "step": 16222 + }, + { + "epoch": 1.7815725895014276, + "grad_norm": 2.3422741889953613, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.720165491104126, + "num_tokens": 410797378.0, + "step": 16223 + }, + { + "epoch": 1.7816824072040411, + "grad_norm": 2.191685676574707, + "learning_rate": 1e-06, + "loss": 0.8404, + "mean_token_accuracy": 0.7316317558288574, + "num_tokens": 410822377.0, + "step": 16224 + }, + { + "epoch": 1.781792224906655, + "grad_norm": 2.2444698810577393, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.7023317813873291, + "num_tokens": 410848273.0, + "step": 16225 + }, + { + "epoch": 1.7819020426092687, + "grad_norm": 2.781494617462158, + "learning_rate": 1e-06, + "loss": 0.8452, + "mean_token_accuracy": 0.7310401201248169, + "num_tokens": 410865880.0, + "step": 16226 + }, + { + "epoch": 1.7820118603118824, + "grad_norm": 2.1141245365142822, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7244202494621277, + "num_tokens": 410894256.0, + "step": 16227 + }, + { + "epoch": 1.782121678014496, + "grad_norm": 2.212982416152954, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7117565870285034, + "num_tokens": 410920949.0, + "step": 16228 + }, + { + "epoch": 1.7822314957171095, + "grad_norm": 2.1188559532165527, + "learning_rate": 1e-06, + "loss": 0.8705, + 
"mean_token_accuracy": 0.735623836517334, + "num_tokens": 410947516.0, + "step": 16229 + }, + { + "epoch": 1.7823413134197232, + "grad_norm": 2.4031965732574463, + "learning_rate": 1e-06, + "loss": 0.8344, + "mean_token_accuracy": 0.7334682941436768, + "num_tokens": 410970756.0, + "step": 16230 + }, + { + "epoch": 1.782451131122337, + "grad_norm": 2.296534538269043, + "learning_rate": 1e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7227957248687744, + "num_tokens": 410993775.0, + "step": 16231 + }, + { + "epoch": 1.7825609488249505, + "grad_norm": 2.4943439960479736, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7068732976913452, + "num_tokens": 411015732.0, + "step": 16232 + }, + { + "epoch": 1.7826707665275643, + "grad_norm": 2.1839423179626465, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7189784049987793, + "num_tokens": 411041495.0, + "step": 16233 + }, + { + "epoch": 1.7827805842301778, + "grad_norm": 2.477846622467041, + "learning_rate": 1e-06, + "loss": 0.8501, + "mean_token_accuracy": 0.7313171625137329, + "num_tokens": 411064506.0, + "step": 16234 + }, + { + "epoch": 1.7828904019327916, + "grad_norm": 2.2116615772247314, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7205382585525513, + "num_tokens": 411090837.0, + "step": 16235 + }, + { + "epoch": 1.7830002196354053, + "grad_norm": 2.1456713676452637, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7010297775268555, + "num_tokens": 411118945.0, + "step": 16236 + }, + { + "epoch": 1.7831100373380189, + "grad_norm": 2.121356964111328, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7137594223022461, + "num_tokens": 411146311.0, + "step": 16237 + }, + { + "epoch": 1.7832198550406324, + "grad_norm": 2.1203038692474365, + "learning_rate": 1e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.726540744304657, + "num_tokens": 411172063.0, + "step": 16238 + }, + { + "epoch": 
1.7833296727432462, + "grad_norm": 2.317028760910034, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7040252685546875, + "num_tokens": 411196442.0, + "step": 16239 + }, + { + "epoch": 1.78343949044586, + "grad_norm": 2.1995863914489746, + "learning_rate": 1e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.7079007625579834, + "num_tokens": 411223597.0, + "step": 16240 + }, + { + "epoch": 1.7835493081484737, + "grad_norm": 2.535386085510254, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7170562148094177, + "num_tokens": 411246383.0, + "step": 16241 + }, + { + "epoch": 1.7836591258510872, + "grad_norm": 2.226438522338867, + "learning_rate": 1e-06, + "loss": 0.8248, + "mean_token_accuracy": 0.7344805598258972, + "num_tokens": 411272017.0, + "step": 16242 + }, + { + "epoch": 1.7837689435537007, + "grad_norm": 2.25884747505188, + "learning_rate": 1e-06, + "loss": 0.8385, + "mean_token_accuracy": 0.7348864078521729, + "num_tokens": 411297442.0, + "step": 16243 + }, + { + "epoch": 1.7838787612563145, + "grad_norm": 2.4817984104156494, + "learning_rate": 1e-06, + "loss": 0.881, + "mean_token_accuracy": 0.7219429612159729, + "num_tokens": 411319574.0, + "step": 16244 + }, + { + "epoch": 1.7839885789589283, + "grad_norm": 2.2722084522247314, + "learning_rate": 1e-06, + "loss": 0.8455, + "mean_token_accuracy": 0.7454713582992554, + "num_tokens": 411343394.0, + "step": 16245 + }, + { + "epoch": 1.7840983966615418, + "grad_norm": 2.03952693939209, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.7057365775108337, + "num_tokens": 411372953.0, + "step": 16246 + }, + { + "epoch": 1.7842082143641556, + "grad_norm": 2.0724380016326904, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.717039942741394, + "num_tokens": 411401676.0, + "step": 16247 + }, + { + "epoch": 1.784318032066769, + "grad_norm": 2.0570969581604004, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 
0.7099655866622925, + "num_tokens": 411432944.0, + "step": 16248 + }, + { + "epoch": 1.7844278497693828, + "grad_norm": 2.1791300773620605, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7322696447372437, + "num_tokens": 411461854.0, + "step": 16249 + }, + { + "epoch": 1.7845376674719966, + "grad_norm": 2.5296647548675537, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7275357246398926, + "num_tokens": 411482540.0, + "step": 16250 + }, + { + "epoch": 1.7846474851746101, + "grad_norm": 2.203138589859009, + "learning_rate": 1e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.7353341579437256, + "num_tokens": 411508972.0, + "step": 16251 + }, + { + "epoch": 1.7847573028772237, + "grad_norm": 2.2464940547943115, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7230404615402222, + "num_tokens": 411534396.0, + "step": 16252 + }, + { + "epoch": 1.7848671205798374, + "grad_norm": 2.51570725440979, + "learning_rate": 1e-06, + "loss": 0.8418, + "mean_token_accuracy": 0.7394325733184814, + "num_tokens": 411555487.0, + "step": 16253 + }, + { + "epoch": 1.7849769382824512, + "grad_norm": 2.410102367401123, + "learning_rate": 1e-06, + "loss": 0.8203, + "mean_token_accuracy": 0.7383118867874146, + "num_tokens": 411576591.0, + "step": 16254 + }, + { + "epoch": 1.785086755985065, + "grad_norm": 2.4081788063049316, + "learning_rate": 1e-06, + "loss": 0.7835, + "mean_token_accuracy": 0.7532100081443787, + "num_tokens": 411596975.0, + "step": 16255 + }, + { + "epoch": 1.7851965736876785, + "grad_norm": 2.3352980613708496, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7072500586509705, + "num_tokens": 411622298.0, + "step": 16256 + }, + { + "epoch": 1.785306391390292, + "grad_norm": 2.193629503250122, + "learning_rate": 1e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7309623956680298, + "num_tokens": 411647447.0, + "step": 16257 + }, + { + "epoch": 1.7854162090929058, + "grad_norm": 
2.070497989654541, + "learning_rate": 1e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.7330397963523865, + "num_tokens": 411676122.0, + "step": 16258 + }, + { + "epoch": 1.7855260267955195, + "grad_norm": 2.2120752334594727, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7212203741073608, + "num_tokens": 411704136.0, + "step": 16259 + }, + { + "epoch": 1.785635844498133, + "grad_norm": 2.2648632526397705, + "learning_rate": 1e-06, + "loss": 0.8783, + "mean_token_accuracy": 0.7239561080932617, + "num_tokens": 411728823.0, + "step": 16260 + }, + { + "epoch": 1.7857456622007466, + "grad_norm": 2.2738497257232666, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.72593092918396, + "num_tokens": 411754202.0, + "step": 16261 + }, + { + "epoch": 1.7858554799033604, + "grad_norm": 2.28814959526062, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7140932679176331, + "num_tokens": 411780683.0, + "step": 16262 + }, + { + "epoch": 1.7859652976059741, + "grad_norm": 2.4148919582366943, + "learning_rate": 1e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.7188753485679626, + "num_tokens": 411802811.0, + "step": 16263 + }, + { + "epoch": 1.7860751153085879, + "grad_norm": 2.3675591945648193, + "learning_rate": 1e-06, + "loss": 0.7702, + "mean_token_accuracy": 0.7552645206451416, + "num_tokens": 411824793.0, + "step": 16264 + }, + { + "epoch": 1.7861849330112014, + "grad_norm": 2.081334114074707, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7330359816551208, + "num_tokens": 411853221.0, + "step": 16265 + }, + { + "epoch": 1.786294750713815, + "grad_norm": 2.5541329383850098, + "learning_rate": 1e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.7350832223892212, + "num_tokens": 411875340.0, + "step": 16266 + }, + { + "epoch": 1.7864045684164287, + "grad_norm": 2.4019553661346436, + "learning_rate": 1e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7260258197784424, + "num_tokens": 
411898724.0, + "step": 16267 + }, + { + "epoch": 1.7865143861190425, + "grad_norm": 2.221221685409546, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7124487161636353, + "num_tokens": 411924384.0, + "step": 16268 + }, + { + "epoch": 1.7866242038216562, + "grad_norm": 2.211772918701172, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7145360708236694, + "num_tokens": 411951582.0, + "step": 16269 + }, + { + "epoch": 1.7867340215242697, + "grad_norm": 2.6713223457336426, + "learning_rate": 1e-06, + "loss": 0.8347, + "mean_token_accuracy": 0.739841639995575, + "num_tokens": 411969231.0, + "step": 16270 + }, + { + "epoch": 1.7868438392268833, + "grad_norm": 2.7906084060668945, + "learning_rate": 1e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7273629307746887, + "num_tokens": 411988450.0, + "step": 16271 + }, + { + "epoch": 1.786953656929497, + "grad_norm": 2.1977803707122803, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.6970900297164917, + "num_tokens": 412013338.0, + "step": 16272 + }, + { + "epoch": 1.7870634746321108, + "grad_norm": 2.1577694416046143, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7175170183181763, + "num_tokens": 412039818.0, + "step": 16273 + }, + { + "epoch": 1.7871732923347243, + "grad_norm": 2.40447735786438, + "learning_rate": 1e-06, + "loss": 0.8281, + "mean_token_accuracy": 0.7344411015510559, + "num_tokens": 412061530.0, + "step": 16274 + }, + { + "epoch": 1.7872831100373379, + "grad_norm": 2.291154623031616, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7056261897087097, + "num_tokens": 412087686.0, + "step": 16275 + }, + { + "epoch": 1.7873929277399516, + "grad_norm": 2.2448878288269043, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.720042884349823, + "num_tokens": 412115122.0, + "step": 16276 + }, + { + "epoch": 1.7875027454425654, + "grad_norm": 2.314676523208618, + "learning_rate": 
1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.71122145652771, + "num_tokens": 412138389.0, + "step": 16277 + }, + { + "epoch": 1.7876125631451791, + "grad_norm": 2.309936285018921, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7274139523506165, + "num_tokens": 412159628.0, + "step": 16278 + }, + { + "epoch": 1.7877223808477927, + "grad_norm": 1.9904882907867432, + "learning_rate": 1e-06, + "loss": 0.8671, + "mean_token_accuracy": 0.726845920085907, + "num_tokens": 412188726.0, + "step": 16279 + }, + { + "epoch": 1.7878321985504062, + "grad_norm": 2.0790209770202637, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.6950942277908325, + "num_tokens": 412220064.0, + "step": 16280 + }, + { + "epoch": 1.78794201625302, + "grad_norm": 2.4415128231048584, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.6993217468261719, + "num_tokens": 412244202.0, + "step": 16281 + }, + { + "epoch": 1.7880518339556337, + "grad_norm": 2.3745365142822266, + "learning_rate": 1e-06, + "loss": 0.7631, + "mean_token_accuracy": 0.7553868293762207, + "num_tokens": 412265842.0, + "step": 16282 + }, + { + "epoch": 1.7881616516582475, + "grad_norm": 2.1400058269500732, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7186378240585327, + "num_tokens": 412293739.0, + "step": 16283 + }, + { + "epoch": 1.788271469360861, + "grad_norm": 2.3945674896240234, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7338645458221436, + "num_tokens": 412314294.0, + "step": 16284 + }, + { + "epoch": 1.7883812870634745, + "grad_norm": 2.3041651248931885, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7021569013595581, + "num_tokens": 412341408.0, + "step": 16285 + }, + { + "epoch": 1.7884911047660883, + "grad_norm": 2.2767250537872314, + "learning_rate": 1e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7228047847747803, + "num_tokens": 412367266.0, + "step": 16286 + }, + { + 
"epoch": 1.788600922468702, + "grad_norm": 1.8863873481750488, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7166677713394165, + "num_tokens": 412401291.0, + "step": 16287 + }, + { + "epoch": 1.7887107401713156, + "grad_norm": 2.5058681964874268, + "learning_rate": 1e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.741126537322998, + "num_tokens": 412423478.0, + "step": 16288 + }, + { + "epoch": 1.7888205578739291, + "grad_norm": 2.127091646194458, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7130154967308044, + "num_tokens": 412450270.0, + "step": 16289 + }, + { + "epoch": 1.7889303755765429, + "grad_norm": 2.2665281295776367, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7119441628456116, + "num_tokens": 412475200.0, + "step": 16290 + }, + { + "epoch": 1.7890401932791566, + "grad_norm": 2.3229336738586426, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.6942278146743774, + "num_tokens": 412508353.0, + "step": 16291 + }, + { + "epoch": 1.7891500109817704, + "grad_norm": 2.377053737640381, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7154102325439453, + "num_tokens": 412530643.0, + "step": 16292 + }, + { + "epoch": 1.789259828684384, + "grad_norm": 2.3755297660827637, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7074323892593384, + "num_tokens": 412554743.0, + "step": 16293 + }, + { + "epoch": 1.7893696463869975, + "grad_norm": 2.4460082054138184, + "learning_rate": 1e-06, + "loss": 0.8587, + "mean_token_accuracy": 0.7277194857597351, + "num_tokens": 412575362.0, + "step": 16294 + }, + { + "epoch": 1.7894794640896112, + "grad_norm": 2.6083145141601562, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7075132131576538, + "num_tokens": 412596612.0, + "step": 16295 + }, + { + "epoch": 1.789589281792225, + "grad_norm": 2.0737035274505615, + "learning_rate": 1e-06, + "loss": 0.9789, + 
"mean_token_accuracy": 0.7028800845146179, + "num_tokens": 412625355.0, + "step": 16296 + }, + { + "epoch": 1.7896990994948385, + "grad_norm": 2.305771589279175, + "learning_rate": 1e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.7314910888671875, + "num_tokens": 412649831.0, + "step": 16297 + }, + { + "epoch": 1.7898089171974523, + "grad_norm": 2.2350597381591797, + "learning_rate": 1e-06, + "loss": 1.0121, + "mean_token_accuracy": 0.6906562447547913, + "num_tokens": 412678349.0, + "step": 16298 + }, + { + "epoch": 1.7899187349000658, + "grad_norm": 2.1385412216186523, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7181999683380127, + "num_tokens": 412705381.0, + "step": 16299 + }, + { + "epoch": 1.7900285526026796, + "grad_norm": 2.3063268661499023, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7094694375991821, + "num_tokens": 412729988.0, + "step": 16300 + }, + { + "epoch": 1.7901383703052933, + "grad_norm": 2.1181514263153076, + "learning_rate": 1e-06, + "loss": 0.8606, + "mean_token_accuracy": 0.7257713079452515, + "num_tokens": 412756601.0, + "step": 16301 + }, + { + "epoch": 1.7902481880079069, + "grad_norm": 2.10799241065979, + "learning_rate": 1e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7272248864173889, + "num_tokens": 412785315.0, + "step": 16302 + }, + { + "epoch": 1.7903580057105204, + "grad_norm": 2.7030739784240723, + "learning_rate": 1e-06, + "loss": 0.8346, + "mean_token_accuracy": 0.7354851961135864, + "num_tokens": 412802988.0, + "step": 16303 + }, + { + "epoch": 1.7904678234131342, + "grad_norm": 2.100292682647705, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7060402631759644, + "num_tokens": 412833205.0, + "step": 16304 + }, + { + "epoch": 1.790577641115748, + "grad_norm": 2.1782429218292236, + "learning_rate": 1e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7246755361557007, + "num_tokens": 412860037.0, + "step": 16305 + }, + { + "epoch": 
1.7906874588183617, + "grad_norm": 2.417675495147705, + "learning_rate": 1e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.74000084400177, + "num_tokens": 412882620.0, + "step": 16306 + }, + { + "epoch": 1.7907972765209752, + "grad_norm": 2.0583908557891846, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7141394019126892, + "num_tokens": 412911956.0, + "step": 16307 + }, + { + "epoch": 1.7909070942235887, + "grad_norm": 2.189049243927002, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.709509015083313, + "num_tokens": 412942175.0, + "step": 16308 + }, + { + "epoch": 1.7910169119262025, + "grad_norm": 2.340359687805176, + "learning_rate": 1e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.7236384153366089, + "num_tokens": 412965524.0, + "step": 16309 + }, + { + "epoch": 1.7911267296288162, + "grad_norm": 2.306009292602539, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7092040777206421, + "num_tokens": 412990147.0, + "step": 16310 + }, + { + "epoch": 1.7912365473314298, + "grad_norm": 2.617910623550415, + "learning_rate": 1e-06, + "loss": 0.8229, + "mean_token_accuracy": 0.7343059778213501, + "num_tokens": 413009547.0, + "step": 16311 + }, + { + "epoch": 1.7913463650340435, + "grad_norm": 2.2580761909484863, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7326359748840332, + "num_tokens": 413034461.0, + "step": 16312 + }, + { + "epoch": 1.791456182736657, + "grad_norm": 2.2807376384735107, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.705256462097168, + "num_tokens": 413057985.0, + "step": 16313 + }, + { + "epoch": 1.7915660004392708, + "grad_norm": 2.44020676612854, + "learning_rate": 1e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7276816368103027, + "num_tokens": 413080507.0, + "step": 16314 + }, + { + "epoch": 1.7916758181418846, + "grad_norm": 2.3546602725982666, + "learning_rate": 1e-06, + "loss": 0.8484, + "mean_token_accuracy": 
0.727766752243042, + "num_tokens": 413103609.0, + "step": 16315 + }, + { + "epoch": 1.7917856358444981, + "grad_norm": 2.9438974857330322, + "learning_rate": 1e-06, + "loss": 0.8398, + "mean_token_accuracy": 0.7340083122253418, + "num_tokens": 413119608.0, + "step": 16316 + }, + { + "epoch": 1.7918954535471117, + "grad_norm": 2.45294189453125, + "learning_rate": 1e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.7228440046310425, + "num_tokens": 413141044.0, + "step": 16317 + }, + { + "epoch": 1.7920052712497254, + "grad_norm": 2.1646318435668945, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.6957010626792908, + "num_tokens": 413168011.0, + "step": 16318 + }, + { + "epoch": 1.7921150889523392, + "grad_norm": 1.9551308155059814, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7158216238021851, + "num_tokens": 413199672.0, + "step": 16319 + }, + { + "epoch": 1.792224906654953, + "grad_norm": 2.2980570793151855, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.7215838432312012, + "num_tokens": 413224982.0, + "step": 16320 + }, + { + "epoch": 1.7923347243575665, + "grad_norm": 2.05379319190979, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7132132649421692, + "num_tokens": 413253087.0, + "step": 16321 + }, + { + "epoch": 1.79244454206018, + "grad_norm": 2.256708860397339, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.6991676092147827, + "num_tokens": 413280695.0, + "step": 16322 + }, + { + "epoch": 1.7925543597627938, + "grad_norm": 2.4216082096099854, + "learning_rate": 1e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.7307099103927612, + "num_tokens": 413303858.0, + "step": 16323 + }, + { + "epoch": 1.7926641774654075, + "grad_norm": 2.08613657951355, + "learning_rate": 1e-06, + "loss": 0.8368, + "mean_token_accuracy": 0.7392035722732544, + "num_tokens": 413330025.0, + "step": 16324 + }, + { + "epoch": 1.792773995168021, + "grad_norm": 
2.213141441345215, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.725763738155365, + "num_tokens": 413356705.0, + "step": 16325 + }, + { + "epoch": 1.7928838128706346, + "grad_norm": 2.3581244945526123, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.6954870223999023, + "num_tokens": 413383609.0, + "step": 16326 + }, + { + "epoch": 1.7929936305732483, + "grad_norm": 2.2460691928863525, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.713191032409668, + "num_tokens": 413409397.0, + "step": 16327 + }, + { + "epoch": 1.793103448275862, + "grad_norm": 2.0437488555908203, + "learning_rate": 1e-06, + "loss": 0.8504, + "mean_token_accuracy": 0.7330524921417236, + "num_tokens": 413437426.0, + "step": 16328 + }, + { + "epoch": 1.7932132659784759, + "grad_norm": 2.6056103706359863, + "learning_rate": 1e-06, + "loss": 0.8007, + "mean_token_accuracy": 0.7416480779647827, + "num_tokens": 413456169.0, + "step": 16329 + }, + { + "epoch": 1.7933230836810894, + "grad_norm": 2.271977663040161, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7141265273094177, + "num_tokens": 413480702.0, + "step": 16330 + }, + { + "epoch": 1.793432901383703, + "grad_norm": 1.877086877822876, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.7010018825531006, + "num_tokens": 413516590.0, + "step": 16331 + }, + { + "epoch": 1.7935427190863167, + "grad_norm": 2.5478556156158447, + "learning_rate": 1e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7242512702941895, + "num_tokens": 413539140.0, + "step": 16332 + }, + { + "epoch": 1.7936525367889304, + "grad_norm": 2.4018759727478027, + "learning_rate": 1e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.7309776544570923, + "num_tokens": 413562337.0, + "step": 16333 + }, + { + "epoch": 1.7937623544915442, + "grad_norm": 2.3196420669555664, + "learning_rate": 1e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7251006364822388, + "num_tokens": 
413588386.0, + "step": 16334 + }, + { + "epoch": 1.7938721721941577, + "grad_norm": 2.3046908378601074, + "learning_rate": 1e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7285230159759521, + "num_tokens": 413611555.0, + "step": 16335 + }, + { + "epoch": 1.7939819898967713, + "grad_norm": 2.3324403762817383, + "learning_rate": 1e-06, + "loss": 0.7784, + "mean_token_accuracy": 0.7522614002227783, + "num_tokens": 413633364.0, + "step": 16336 + }, + { + "epoch": 1.794091807599385, + "grad_norm": 2.4068844318389893, + "learning_rate": 1e-06, + "loss": 0.8251, + "mean_token_accuracy": 0.7317166924476624, + "num_tokens": 413655482.0, + "step": 16337 + }, + { + "epoch": 1.7942016253019988, + "grad_norm": 2.0521039962768555, + "learning_rate": 1e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7233181595802307, + "num_tokens": 413683095.0, + "step": 16338 + }, + { + "epoch": 1.7943114430046123, + "grad_norm": 2.00350022315979, + "learning_rate": 1e-06, + "loss": 1.0108, + "mean_token_accuracy": 0.6982500553131104, + "num_tokens": 413715531.0, + "step": 16339 + }, + { + "epoch": 1.7944212607072259, + "grad_norm": 2.359449863433838, + "learning_rate": 1e-06, + "loss": 0.8639, + "mean_token_accuracy": 0.7302907705307007, + "num_tokens": 413738592.0, + "step": 16340 + }, + { + "epoch": 1.7945310784098396, + "grad_norm": 2.5958917140960693, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7183210849761963, + "num_tokens": 413758892.0, + "step": 16341 + }, + { + "epoch": 1.7946408961124534, + "grad_norm": 2.230449914932251, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.716609001159668, + "num_tokens": 413785744.0, + "step": 16342 + }, + { + "epoch": 1.7947507138150671, + "grad_norm": 2.0854971408843994, + "learning_rate": 1e-06, + "loss": 0.8691, + "mean_token_accuracy": 0.7255693674087524, + "num_tokens": 413815142.0, + "step": 16343 + }, + { + "epoch": 1.7948605315176807, + "grad_norm": 2.083698272705078, + "learning_rate": 
1e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7251406908035278, + "num_tokens": 413844522.0, + "step": 16344 + }, + { + "epoch": 1.7949703492202942, + "grad_norm": 2.0831236839294434, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7150939702987671, + "num_tokens": 413876892.0, + "step": 16345 + }, + { + "epoch": 1.795080166922908, + "grad_norm": 3.0107152462005615, + "learning_rate": 1e-06, + "loss": 0.8188, + "mean_token_accuracy": 0.7356125712394714, + "num_tokens": 413892442.0, + "step": 16346 + }, + { + "epoch": 1.7951899846255217, + "grad_norm": 2.2648420333862305, + "learning_rate": 1e-06, + "loss": 0.852, + "mean_token_accuracy": 0.7337995767593384, + "num_tokens": 413917098.0, + "step": 16347 + }, + { + "epoch": 1.7952998023281352, + "grad_norm": 2.1951942443847656, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.6933209896087646, + "num_tokens": 413943458.0, + "step": 16348 + }, + { + "epoch": 1.795409620030749, + "grad_norm": 2.3034355640411377, + "learning_rate": 1e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.7246365547180176, + "num_tokens": 413968250.0, + "step": 16349 + }, + { + "epoch": 1.7955194377333625, + "grad_norm": 2.4071662425994873, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7177557945251465, + "num_tokens": 413991886.0, + "step": 16350 + }, + { + "epoch": 1.7956292554359763, + "grad_norm": 2.33642840385437, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.6960248947143555, + "num_tokens": 414017904.0, + "step": 16351 + }, + { + "epoch": 1.79573907313859, + "grad_norm": 2.2189221382141113, + "learning_rate": 1e-06, + "loss": 0.776, + "mean_token_accuracy": 0.7572685480117798, + "num_tokens": 414042712.0, + "step": 16352 + }, + { + "epoch": 1.7958488908412036, + "grad_norm": 2.4510550498962402, + "learning_rate": 1e-06, + "loss": 0.8431, + "mean_token_accuracy": 0.7383952140808105, + "num_tokens": 414067796.0, + "step": 16353 + }, + { + 
"epoch": 1.7959587085438171, + "grad_norm": 2.4879133701324463, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.71613609790802, + "num_tokens": 414091130.0, + "step": 16354 + }, + { + "epoch": 1.7960685262464309, + "grad_norm": 2.0686261653900146, + "learning_rate": 1e-06, + "loss": 0.8448, + "mean_token_accuracy": 0.73768150806427, + "num_tokens": 414120453.0, + "step": 16355 + }, + { + "epoch": 1.7961783439490446, + "grad_norm": 2.0248608589172363, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7227728366851807, + "num_tokens": 414151173.0, + "step": 16356 + }, + { + "epoch": 1.7962881616516584, + "grad_norm": 2.6467063426971436, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7177720069885254, + "num_tokens": 414172305.0, + "step": 16357 + }, + { + "epoch": 1.796397979354272, + "grad_norm": 1.867370367050171, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7170946598052979, + "num_tokens": 414206391.0, + "step": 16358 + }, + { + "epoch": 1.7965077970568855, + "grad_norm": 2.094583749771118, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.6967568397521973, + "num_tokens": 414235825.0, + "step": 16359 + }, + { + "epoch": 1.7966176147594992, + "grad_norm": 2.1912200450897217, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7108447551727295, + "num_tokens": 414261157.0, + "step": 16360 + }, + { + "epoch": 1.796727432462113, + "grad_norm": 2.2822773456573486, + "learning_rate": 1e-06, + "loss": 0.8147, + "mean_token_accuracy": 0.748052716255188, + "num_tokens": 414284923.0, + "step": 16361 + }, + { + "epoch": 1.7968372501647265, + "grad_norm": 2.4020743370056152, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7125967741012573, + "num_tokens": 414307440.0, + "step": 16362 + }, + { + "epoch": 1.7969470678673403, + "grad_norm": 2.4452388286590576, + "learning_rate": 1e-06, + "loss": 0.9027, + 
"mean_token_accuracy": 0.7194628715515137, + "num_tokens": 414330819.0, + "step": 16363 + }, + { + "epoch": 1.7970568855699538, + "grad_norm": 2.233170509338379, + "learning_rate": 1e-06, + "loss": 0.8469, + "mean_token_accuracy": 0.7339192628860474, + "num_tokens": 414356041.0, + "step": 16364 + }, + { + "epoch": 1.7971667032725676, + "grad_norm": 1.9337570667266846, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7094963192939758, + "num_tokens": 414388506.0, + "step": 16365 + }, + { + "epoch": 1.7972765209751813, + "grad_norm": 2.2840001583099365, + "learning_rate": 1e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.7337437272071838, + "num_tokens": 414412966.0, + "step": 16366 + }, + { + "epoch": 1.7973863386777948, + "grad_norm": 2.199028253555298, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7200407981872559, + "num_tokens": 414437968.0, + "step": 16367 + }, + { + "epoch": 1.7974961563804084, + "grad_norm": 2.2663252353668213, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7083083987236023, + "num_tokens": 414466750.0, + "step": 16368 + }, + { + "epoch": 1.7976059740830221, + "grad_norm": 2.1652796268463135, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7302484512329102, + "num_tokens": 414493481.0, + "step": 16369 + }, + { + "epoch": 1.797715791785636, + "grad_norm": 2.578761577606201, + "learning_rate": 1e-06, + "loss": 0.8704, + "mean_token_accuracy": 0.7294599413871765, + "num_tokens": 414512970.0, + "step": 16370 + }, + { + "epoch": 1.7978256094882497, + "grad_norm": 2.243711233139038, + "learning_rate": 1e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7242556810379028, + "num_tokens": 414537399.0, + "step": 16371 + }, + { + "epoch": 1.7979354271908632, + "grad_norm": 2.552112340927124, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7152613997459412, + "num_tokens": 414557946.0, + "step": 16372 + }, + { + "epoch": 
1.7980452448934767, + "grad_norm": 1.9507516622543335, + "learning_rate": 1e-06, + "loss": 0.8532, + "mean_token_accuracy": 0.7267059683799744, + "num_tokens": 414589492.0, + "step": 16373 + }, + { + "epoch": 1.7981550625960905, + "grad_norm": 2.646937847137451, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7174976468086243, + "num_tokens": 414609888.0, + "step": 16374 + }, + { + "epoch": 1.7982648802987042, + "grad_norm": 2.2074880599975586, + "learning_rate": 1e-06, + "loss": 0.7963, + "mean_token_accuracy": 0.7525380253791809, + "num_tokens": 414634104.0, + "step": 16375 + }, + { + "epoch": 1.7983746980013178, + "grad_norm": 2.1675853729248047, + "learning_rate": 1e-06, + "loss": 0.8312, + "mean_token_accuracy": 0.7401967644691467, + "num_tokens": 414660301.0, + "step": 16376 + }, + { + "epoch": 1.7984845157039313, + "grad_norm": 2.3005125522613525, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7232831716537476, + "num_tokens": 414683804.0, + "step": 16377 + }, + { + "epoch": 1.798594333406545, + "grad_norm": 2.3952441215515137, + "learning_rate": 1e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7272230982780457, + "num_tokens": 414706934.0, + "step": 16378 + }, + { + "epoch": 1.7987041511091588, + "grad_norm": 2.0298001766204834, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.6975592374801636, + "num_tokens": 414737701.0, + "step": 16379 + }, + { + "epoch": 1.7988139688117726, + "grad_norm": 2.1130926609039307, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7111198902130127, + "num_tokens": 414765516.0, + "step": 16380 + }, + { + "epoch": 1.7989237865143861, + "grad_norm": 1.989762544631958, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7247588634490967, + "num_tokens": 414798352.0, + "step": 16381 + }, + { + "epoch": 1.7990336042169996, + "grad_norm": 2.1167609691619873, + "learning_rate": 1e-06, + "loss": 0.847, + 
"mean_token_accuracy": 0.7323601245880127, + "num_tokens": 414825887.0, + "step": 16382 + }, + { + "epoch": 1.7991434219196134, + "grad_norm": 2.091336965560913, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7105882167816162, + "num_tokens": 414856949.0, + "step": 16383 + }, + { + "epoch": 1.7992532396222272, + "grad_norm": 2.3658435344696045, + "learning_rate": 1e-06, + "loss": 0.8225, + "mean_token_accuracy": 0.749005913734436, + "num_tokens": 414880411.0, + "step": 16384 + }, + { + "epoch": 1.799363057324841, + "grad_norm": 2.3605592250823975, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7033318877220154, + "num_tokens": 414904234.0, + "step": 16385 + }, + { + "epoch": 1.7994728750274545, + "grad_norm": 1.9684637784957886, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7079665064811707, + "num_tokens": 414936391.0, + "step": 16386 + }, + { + "epoch": 1.799582692730068, + "grad_norm": 2.512781858444214, + "learning_rate": 1e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.6965426206588745, + "num_tokens": 414960257.0, + "step": 16387 + }, + { + "epoch": 1.7996925104326817, + "grad_norm": 2.5564656257629395, + "learning_rate": 1e-06, + "loss": 0.7819, + "mean_token_accuracy": 0.7453272938728333, + "num_tokens": 414979811.0, + "step": 16388 + }, + { + "epoch": 1.7998023281352955, + "grad_norm": 2.4745852947235107, + "learning_rate": 1e-06, + "loss": 0.8448, + "mean_token_accuracy": 0.7318462133407593, + "num_tokens": 415000434.0, + "step": 16389 + }, + { + "epoch": 1.799912145837909, + "grad_norm": 2.180633783340454, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7073367834091187, + "num_tokens": 415026637.0, + "step": 16390 + }, + { + "epoch": 1.8000219635405226, + "grad_norm": 2.092170000076294, + "learning_rate": 1e-06, + "loss": 0.7989, + "mean_token_accuracy": 0.7432159781455994, + "num_tokens": 415051948.0, + "step": 16391 + }, + { + "epoch": 1.8001317812431363, + 
"grad_norm": 2.2053656578063965, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7133077383041382, + "num_tokens": 415077988.0, + "step": 16392 + }, + { + "epoch": 1.80024159894575, + "grad_norm": 2.275484561920166, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.6990780234336853, + "num_tokens": 415101998.0, + "step": 16393 + }, + { + "epoch": 1.8003514166483638, + "grad_norm": 1.9380601644515991, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.6997306942939758, + "num_tokens": 415133730.0, + "step": 16394 + }, + { + "epoch": 1.8004612343509774, + "grad_norm": 2.0330541133880615, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7205359935760498, + "num_tokens": 415163217.0, + "step": 16395 + }, + { + "epoch": 1.800571052053591, + "grad_norm": 2.411341905593872, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7173653841018677, + "num_tokens": 415188304.0, + "step": 16396 + }, + { + "epoch": 1.8006808697562047, + "grad_norm": 2.2511515617370605, + "learning_rate": 1e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.7216655015945435, + "num_tokens": 415215111.0, + "step": 16397 + }, + { + "epoch": 1.8007906874588184, + "grad_norm": 2.3965282440185547, + "learning_rate": 1e-06, + "loss": 0.8528, + "mean_token_accuracy": 0.7318658828735352, + "num_tokens": 415238246.0, + "step": 16398 + }, + { + "epoch": 1.8009005051614322, + "grad_norm": 2.2782130241394043, + "learning_rate": 1e-06, + "loss": 0.815, + "mean_token_accuracy": 0.739399790763855, + "num_tokens": 415262032.0, + "step": 16399 + }, + { + "epoch": 1.8010103228640457, + "grad_norm": 2.1298201084136963, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.709668755531311, + "num_tokens": 415288721.0, + "step": 16400 + }, + { + "epoch": 1.8011201405666593, + "grad_norm": 2.6844375133514404, + "learning_rate": 1e-06, + "loss": 0.7745, + "mean_token_accuracy": 0.7606561779975891, + 
"num_tokens": 415306820.0, + "step": 16401 + }, + { + "epoch": 1.801229958269273, + "grad_norm": 2.2590956687927246, + "learning_rate": 1e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.723175048828125, + "num_tokens": 415333543.0, + "step": 16402 + }, + { + "epoch": 1.8013397759718868, + "grad_norm": 2.2679028511047363, + "learning_rate": 1e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7363633513450623, + "num_tokens": 415357890.0, + "step": 16403 + }, + { + "epoch": 1.8014495936745003, + "grad_norm": 2.5457539558410645, + "learning_rate": 1e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7252092361450195, + "num_tokens": 415378011.0, + "step": 16404 + }, + { + "epoch": 1.8015594113771138, + "grad_norm": 2.3976211547851562, + "learning_rate": 1e-06, + "loss": 0.8999, + "mean_token_accuracy": 0.7206502556800842, + "num_tokens": 415401096.0, + "step": 16405 + }, + { + "epoch": 1.8016692290797276, + "grad_norm": 2.0574042797088623, + "learning_rate": 1e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.7234745621681213, + "num_tokens": 415430097.0, + "step": 16406 + }, + { + "epoch": 1.8017790467823414, + "grad_norm": 2.263282060623169, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7072685956954956, + "num_tokens": 415458362.0, + "step": 16407 + }, + { + "epoch": 1.801888864484955, + "grad_norm": 2.48881459236145, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7160065770149231, + "num_tokens": 415482420.0, + "step": 16408 + }, + { + "epoch": 1.8019986821875686, + "grad_norm": 2.23445987701416, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7039365768432617, + "num_tokens": 415511472.0, + "step": 16409 + }, + { + "epoch": 1.8021084998901822, + "grad_norm": 2.1108858585357666, + "learning_rate": 1e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.727901041507721, + "num_tokens": 415540006.0, + "step": 16410 + }, + { + "epoch": 1.802218317592796, + "grad_norm": 2.2622830867767334, + 
"learning_rate": 1e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.7410514950752258, + "num_tokens": 415565942.0, + "step": 16411 + }, + { + "epoch": 1.8023281352954097, + "grad_norm": 2.0296058654785156, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7294466495513916, + "num_tokens": 415594559.0, + "step": 16412 + }, + { + "epoch": 1.8024379529980232, + "grad_norm": 2.1734399795532227, + "learning_rate": 1e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7252304553985596, + "num_tokens": 415623101.0, + "step": 16413 + }, + { + "epoch": 1.802547770700637, + "grad_norm": 2.320732355117798, + "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.6977825164794922, + "num_tokens": 415648055.0, + "step": 16414 + }, + { + "epoch": 1.8026575884032505, + "grad_norm": 1.9380817413330078, + "learning_rate": 1e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.6856915950775146, + "num_tokens": 415685354.0, + "step": 16415 + }, + { + "epoch": 1.8027674061058643, + "grad_norm": 2.0547242164611816, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7203972935676575, + "num_tokens": 415714377.0, + "step": 16416 + }, + { + "epoch": 1.802877223808478, + "grad_norm": 1.9089717864990234, + "learning_rate": 1e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.7132061719894409, + "num_tokens": 415748502.0, + "step": 16417 + }, + { + "epoch": 1.8029870415110916, + "grad_norm": 2.2504079341888428, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7080133557319641, + "num_tokens": 415773393.0, + "step": 16418 + }, + { + "epoch": 1.803096859213705, + "grad_norm": 2.0950374603271484, + "learning_rate": 1e-06, + "loss": 0.8499, + "mean_token_accuracy": 0.7348650097846985, + "num_tokens": 415799477.0, + "step": 16419 + }, + { + "epoch": 1.8032066769163189, + "grad_norm": 2.338242530822754, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.6973921060562134, + "num_tokens": 415822915.0, + 
"step": 16420 + }, + { + "epoch": 1.8033164946189326, + "grad_norm": 2.5209672451019287, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7014403343200684, + "num_tokens": 415844757.0, + "step": 16421 + }, + { + "epoch": 1.8034263123215464, + "grad_norm": 2.3054511547088623, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7135545015335083, + "num_tokens": 415868450.0, + "step": 16422 + }, + { + "epoch": 1.80353613002416, + "grad_norm": 2.337557554244995, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7164447903633118, + "num_tokens": 415891683.0, + "step": 16423 + }, + { + "epoch": 1.8036459477267734, + "grad_norm": 2.070765495300293, + "learning_rate": 1e-06, + "loss": 1.0514, + "mean_token_accuracy": 0.6825622320175171, + "num_tokens": 415921771.0, + "step": 16424 + }, + { + "epoch": 1.8037557654293872, + "grad_norm": 2.2394704818725586, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7002819180488586, + "num_tokens": 415949871.0, + "step": 16425 + }, + { + "epoch": 1.803865583132001, + "grad_norm": 2.405982255935669, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7125413417816162, + "num_tokens": 415972915.0, + "step": 16426 + }, + { + "epoch": 1.8039754008346145, + "grad_norm": 2.148120164871216, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7322338223457336, + "num_tokens": 415998998.0, + "step": 16427 + }, + { + "epoch": 1.8040852185372283, + "grad_norm": 2.019573926925659, + "learning_rate": 1e-06, + "loss": 0.8223, + "mean_token_accuracy": 0.7414968013763428, + "num_tokens": 416026286.0, + "step": 16428 + }, + { + "epoch": 1.8041950362398418, + "grad_norm": 2.661893129348755, + "learning_rate": 1e-06, + "loss": 0.7261, + "mean_token_accuracy": 0.7664512395858765, + "num_tokens": 416045109.0, + "step": 16429 + }, + { + "epoch": 1.8043048539424555, + "grad_norm": 2.2063186168670654, + "learning_rate": 1e-06, + "loss": 
0.8873, + "mean_token_accuracy": 0.71894371509552, + "num_tokens": 416071465.0, + "step": 16430 + }, + { + "epoch": 1.8044146716450693, + "grad_norm": 1.9462329149246216, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7038859128952026, + "num_tokens": 416105128.0, + "step": 16431 + }, + { + "epoch": 1.8045244893476828, + "grad_norm": 2.40871524810791, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.730002760887146, + "num_tokens": 416127077.0, + "step": 16432 + }, + { + "epoch": 1.8046343070502964, + "grad_norm": 1.9942697286605835, + "learning_rate": 1e-06, + "loss": 0.808, + "mean_token_accuracy": 0.7332128286361694, + "num_tokens": 416156435.0, + "step": 16433 + }, + { + "epoch": 1.8047441247529101, + "grad_norm": 2.1522979736328125, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7225174903869629, + "num_tokens": 416183197.0, + "step": 16434 + }, + { + "epoch": 1.8048539424555239, + "grad_norm": 2.048640251159668, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7134361267089844, + "num_tokens": 416213579.0, + "step": 16435 + }, + { + "epoch": 1.8049637601581376, + "grad_norm": 2.6746010780334473, + "learning_rate": 1e-06, + "loss": 0.791, + "mean_token_accuracy": 0.7492936849594116, + "num_tokens": 416232134.0, + "step": 16436 + }, + { + "epoch": 1.8050735778607512, + "grad_norm": 2.234905958175659, + "learning_rate": 1e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7193752527236938, + "num_tokens": 416256258.0, + "step": 16437 + }, + { + "epoch": 1.8051833955633647, + "grad_norm": 2.0557093620300293, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.6989384889602661, + "num_tokens": 416288125.0, + "step": 16438 + }, + { + "epoch": 1.8052932132659785, + "grad_norm": 2.325505018234253, + "learning_rate": 1e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7278208136558533, + "num_tokens": 416311906.0, + "step": 16439 + }, + { + "epoch": 
1.8054030309685922, + "grad_norm": 2.220945358276367, + "learning_rate": 1e-06, + "loss": 1.049, + "mean_token_accuracy": 0.6809635162353516, + "num_tokens": 416340386.0, + "step": 16440 + }, + { + "epoch": 1.8055128486712058, + "grad_norm": 2.328874349594116, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.6970699429512024, + "num_tokens": 416365222.0, + "step": 16441 + }, + { + "epoch": 1.8056226663738193, + "grad_norm": 2.342860460281372, + "learning_rate": 1e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7251405715942383, + "num_tokens": 416390329.0, + "step": 16442 + }, + { + "epoch": 1.805732484076433, + "grad_norm": 2.044868230819702, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7037695050239563, + "num_tokens": 416422787.0, + "step": 16443 + }, + { + "epoch": 1.8058423017790468, + "grad_norm": 2.2225868701934814, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7159438133239746, + "num_tokens": 416450448.0, + "step": 16444 + }, + { + "epoch": 1.8059521194816606, + "grad_norm": 2.5991153717041016, + "learning_rate": 1e-06, + "loss": 0.871, + "mean_token_accuracy": 0.7294120192527771, + "num_tokens": 416471169.0, + "step": 16445 + }, + { + "epoch": 1.806061937184274, + "grad_norm": 2.253380537033081, + "learning_rate": 1e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.7277730703353882, + "num_tokens": 416497022.0, + "step": 16446 + }, + { + "epoch": 1.8061717548868876, + "grad_norm": 2.2181084156036377, + "learning_rate": 1e-06, + "loss": 0.877, + "mean_token_accuracy": 0.7272577285766602, + "num_tokens": 416523717.0, + "step": 16447 + }, + { + "epoch": 1.8062815725895014, + "grad_norm": 2.405863046646118, + "learning_rate": 1e-06, + "loss": 0.8125, + "mean_token_accuracy": 0.7434502840042114, + "num_tokens": 416546521.0, + "step": 16448 + }, + { + "epoch": 1.8063913902921152, + "grad_norm": 2.1951074600219727, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 
0.7228502035140991, + "num_tokens": 416572810.0, + "step": 16449 + }, + { + "epoch": 1.806501207994729, + "grad_norm": 2.375354528427124, + "learning_rate": 1e-06, + "loss": 0.8198, + "mean_token_accuracy": 0.7401741743087769, + "num_tokens": 416596422.0, + "step": 16450 + }, + { + "epoch": 1.8066110256973424, + "grad_norm": 2.437645435333252, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.7070873379707336, + "num_tokens": 416619198.0, + "step": 16451 + }, + { + "epoch": 1.806720843399956, + "grad_norm": 2.280143976211548, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7219592332839966, + "num_tokens": 416643900.0, + "step": 16452 + }, + { + "epoch": 1.8068306611025697, + "grad_norm": 2.0207772254943848, + "learning_rate": 1e-06, + "loss": 0.833, + "mean_token_accuracy": 0.7398648262023926, + "num_tokens": 416672216.0, + "step": 16453 + }, + { + "epoch": 1.8069404788051835, + "grad_norm": 2.209085464477539, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7274762392044067, + "num_tokens": 416697287.0, + "step": 16454 + }, + { + "epoch": 1.807050296507797, + "grad_norm": 2.5426323413848877, + "learning_rate": 1e-06, + "loss": 0.8217, + "mean_token_accuracy": 0.7430631518363953, + "num_tokens": 416717053.0, + "step": 16455 + }, + { + "epoch": 1.8071601142104106, + "grad_norm": 2.0740158557891846, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7137439250946045, + "num_tokens": 416746591.0, + "step": 16456 + }, + { + "epoch": 1.8072699319130243, + "grad_norm": 2.2333312034606934, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.714805543422699, + "num_tokens": 416771732.0, + "step": 16457 + }, + { + "epoch": 1.807379749615638, + "grad_norm": 2.120398998260498, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7042498588562012, + "num_tokens": 416800423.0, + "step": 16458 + }, + { + "epoch": 1.8074895673182518, + "grad_norm": 
1.9692572355270386, + "learning_rate": 1e-06, + "loss": 0.843, + "mean_token_accuracy": 0.7422757148742676, + "num_tokens": 416830326.0, + "step": 16459 + }, + { + "epoch": 1.8075993850208654, + "grad_norm": 2.5782055854797363, + "learning_rate": 1e-06, + "loss": 0.7714, + "mean_token_accuracy": 0.7533044815063477, + "num_tokens": 416849598.0, + "step": 16460 + }, + { + "epoch": 1.807709202723479, + "grad_norm": 2.3512701988220215, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7193637490272522, + "num_tokens": 416875450.0, + "step": 16461 + }, + { + "epoch": 1.8078190204260927, + "grad_norm": 2.3664448261260986, + "learning_rate": 1e-06, + "loss": 0.8239, + "mean_token_accuracy": 0.7433509826660156, + "num_tokens": 416897638.0, + "step": 16462 + }, + { + "epoch": 1.8079288381287064, + "grad_norm": 1.9196531772613525, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7237208485603333, + "num_tokens": 416930529.0, + "step": 16463 + }, + { + "epoch": 1.8080386558313202, + "grad_norm": 2.2910523414611816, + "learning_rate": 1e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7151039242744446, + "num_tokens": 416953341.0, + "step": 16464 + }, + { + "epoch": 1.8081484735339337, + "grad_norm": 2.1170520782470703, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7244168519973755, + "num_tokens": 416979288.0, + "step": 16465 + }, + { + "epoch": 1.8082582912365472, + "grad_norm": 2.161027193069458, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7067840695381165, + "num_tokens": 417006551.0, + "step": 16466 + }, + { + "epoch": 1.808368108939161, + "grad_norm": 1.9032773971557617, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7162694931030273, + "num_tokens": 417041159.0, + "step": 16467 + }, + { + "epoch": 1.8084779266417748, + "grad_norm": 2.0612714290618896, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7030337452888489, + 
"num_tokens": 417072737.0, + "step": 16468 + }, + { + "epoch": 1.8085877443443883, + "grad_norm": 2.062814950942993, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.7283004522323608, + "num_tokens": 417102930.0, + "step": 16469 + }, + { + "epoch": 1.8086975620470018, + "grad_norm": 2.357557535171509, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7257810831069946, + "num_tokens": 417127922.0, + "step": 16470 + }, + { + "epoch": 1.8088073797496156, + "grad_norm": 1.937046766281128, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7101362943649292, + "num_tokens": 417160913.0, + "step": 16471 + }, + { + "epoch": 1.8089171974522293, + "grad_norm": 2.3264098167419434, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7023587822914124, + "num_tokens": 417185355.0, + "step": 16472 + }, + { + "epoch": 1.809027015154843, + "grad_norm": 2.3697874546051025, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7233625650405884, + "num_tokens": 417210110.0, + "step": 16473 + }, + { + "epoch": 1.8091368328574566, + "grad_norm": 2.4250757694244385, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.6975356340408325, + "num_tokens": 417234623.0, + "step": 16474 + }, + { + "epoch": 1.8092466505600702, + "grad_norm": 2.2067017555236816, + "learning_rate": 1e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.7229968309402466, + "num_tokens": 417259189.0, + "step": 16475 + }, + { + "epoch": 1.809356468262684, + "grad_norm": 2.243605852127075, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7052825689315796, + "num_tokens": 417283536.0, + "step": 16476 + }, + { + "epoch": 1.8094662859652977, + "grad_norm": 1.906567096710205, + "learning_rate": 1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.6887803077697754, + "num_tokens": 417317494.0, + "step": 16477 + }, + { + "epoch": 1.8095761036679112, + "grad_norm": 2.5002408027648926, + 
"learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7205761671066284, + "num_tokens": 417339386.0, + "step": 16478 + }, + { + "epoch": 1.809685921370525, + "grad_norm": 2.205028772354126, + "learning_rate": 1e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7230744361877441, + "num_tokens": 417363394.0, + "step": 16479 + }, + { + "epoch": 1.8097957390731385, + "grad_norm": 2.4849116802215576, + "learning_rate": 1e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.7394933104515076, + "num_tokens": 417385957.0, + "step": 16480 + }, + { + "epoch": 1.8099055567757523, + "grad_norm": 2.4666149616241455, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7198912501335144, + "num_tokens": 417408797.0, + "step": 16481 + }, + { + "epoch": 1.810015374478366, + "grad_norm": 2.577491044998169, + "learning_rate": 1e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7223217487335205, + "num_tokens": 417429271.0, + "step": 16482 + }, + { + "epoch": 1.8101251921809796, + "grad_norm": 2.0954749584198, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7301902770996094, + "num_tokens": 417458801.0, + "step": 16483 + }, + { + "epoch": 1.810235009883593, + "grad_norm": 2.4937291145324707, + "learning_rate": 1e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7312278747558594, + "num_tokens": 417481220.0, + "step": 16484 + }, + { + "epoch": 1.8103448275862069, + "grad_norm": 2.2527503967285156, + "learning_rate": 1e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7194554209709167, + "num_tokens": 417506710.0, + "step": 16485 + }, + { + "epoch": 1.8104546452888206, + "grad_norm": 2.4258360862731934, + "learning_rate": 1e-06, + "loss": 0.8333, + "mean_token_accuracy": 0.7320876121520996, + "num_tokens": 417529726.0, + "step": 16486 + }, + { + "epoch": 1.8105644629914344, + "grad_norm": 2.3053956031799316, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7101447582244873, + "num_tokens": 417554350.0, + "step": 
16487 + }, + { + "epoch": 1.810674280694048, + "grad_norm": 2.5930142402648926, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7234204411506653, + "num_tokens": 417574247.0, + "step": 16488 + }, + { + "epoch": 1.8107840983966614, + "grad_norm": 1.9014384746551514, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7167782783508301, + "num_tokens": 417609113.0, + "step": 16489 + }, + { + "epoch": 1.8108939160992752, + "grad_norm": 2.104381799697876, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7099258899688721, + "num_tokens": 417637355.0, + "step": 16490 + }, + { + "epoch": 1.811003733801889, + "grad_norm": 2.0316710472106934, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.716380774974823, + "num_tokens": 417665761.0, + "step": 16491 + }, + { + "epoch": 1.8111135515045025, + "grad_norm": 2.2745954990386963, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7125335931777954, + "num_tokens": 417690101.0, + "step": 16492 + }, + { + "epoch": 1.8112233692071162, + "grad_norm": 2.2165515422821045, + "learning_rate": 1e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.726382315158844, + "num_tokens": 417716525.0, + "step": 16493 + }, + { + "epoch": 1.8113331869097298, + "grad_norm": 2.721653461456299, + "learning_rate": 1e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7330793142318726, + "num_tokens": 417735047.0, + "step": 16494 + }, + { + "epoch": 1.8114430046123435, + "grad_norm": 2.4135756492614746, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7105754613876343, + "num_tokens": 417758299.0, + "step": 16495 + }, + { + "epoch": 1.8115528223149573, + "grad_norm": 2.0433120727539062, + "learning_rate": 1e-06, + "loss": 0.8682, + "mean_token_accuracy": 0.7318288087844849, + "num_tokens": 417788916.0, + "step": 16496 + }, + { + "epoch": 1.8116626400175708, + "grad_norm": 2.530445098876953, + "learning_rate": 1e-06, + "loss": 0.8746, 
+ "mean_token_accuracy": 0.7220611572265625, + "num_tokens": 417809092.0, + "step": 16497 + }, + { + "epoch": 1.8117724577201844, + "grad_norm": 2.5698084831237793, + "learning_rate": 1e-06, + "loss": 0.7864, + "mean_token_accuracy": 0.752399742603302, + "num_tokens": 417827592.0, + "step": 16498 + }, + { + "epoch": 1.8118822754227981, + "grad_norm": 2.0083861351013184, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7226662039756775, + "num_tokens": 417858416.0, + "step": 16499 + }, + { + "epoch": 1.8119920931254119, + "grad_norm": 2.2619621753692627, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.7280277013778687, + "num_tokens": 417883027.0, + "step": 16500 + }, + { + "epoch": 1.8121019108280256, + "grad_norm": 2.1042563915252686, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.718148410320282, + "num_tokens": 417910546.0, + "step": 16501 + }, + { + "epoch": 1.8122117285306392, + "grad_norm": 2.409156322479248, + "learning_rate": 1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7182071208953857, + "num_tokens": 417934938.0, + "step": 16502 + }, + { + "epoch": 1.8123215462332527, + "grad_norm": 1.8986783027648926, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7068959474563599, + "num_tokens": 417969745.0, + "step": 16503 + }, + { + "epoch": 1.8124313639358665, + "grad_norm": 2.4132556915283203, + "learning_rate": 1e-06, + "loss": 0.8829, + "mean_token_accuracy": 0.725170910358429, + "num_tokens": 417993321.0, + "step": 16504 + }, + { + "epoch": 1.8125411816384802, + "grad_norm": 2.3127028942108154, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7016304731369019, + "num_tokens": 418020840.0, + "step": 16505 + }, + { + "epoch": 1.8126509993410937, + "grad_norm": 2.4695019721984863, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7060540914535522, + "num_tokens": 418045851.0, + "step": 16506 + }, + { + "epoch": 
1.8127608170437073, + "grad_norm": 2.3288400173187256, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7043331265449524, + "num_tokens": 418070901.0, + "step": 16507 + }, + { + "epoch": 1.812870634746321, + "grad_norm": 2.6365411281585693, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7205557823181152, + "num_tokens": 418091067.0, + "step": 16508 + }, + { + "epoch": 1.8129804524489348, + "grad_norm": 2.237104654312134, + "learning_rate": 1e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7252812385559082, + "num_tokens": 418116676.0, + "step": 16509 + }, + { + "epoch": 1.8130902701515486, + "grad_norm": 1.9542217254638672, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7201907634735107, + "num_tokens": 418151247.0, + "step": 16510 + }, + { + "epoch": 1.813200087854162, + "grad_norm": 2.4636831283569336, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7184984683990479, + "num_tokens": 418173660.0, + "step": 16511 + }, + { + "epoch": 1.8133099055567756, + "grad_norm": 2.3717994689941406, + "learning_rate": 1e-06, + "loss": 0.7143, + "mean_token_accuracy": 0.7726771831512451, + "num_tokens": 418193761.0, + "step": 16512 + }, + { + "epoch": 1.8134197232593894, + "grad_norm": 2.427450656890869, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7155721783638, + "num_tokens": 418217026.0, + "step": 16513 + }, + { + "epoch": 1.8135295409620031, + "grad_norm": 2.4115517139434814, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7200025916099548, + "num_tokens": 418241670.0, + "step": 16514 + }, + { + "epoch": 1.813639358664617, + "grad_norm": 2.144258737564087, + "learning_rate": 1e-06, + "loss": 0.8369, + "mean_token_accuracy": 0.732422947883606, + "num_tokens": 418272869.0, + "step": 16515 + }, + { + "epoch": 1.8137491763672304, + "grad_norm": 2.2771408557891846, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 
0.7228571772575378, + "num_tokens": 418296864.0, + "step": 16516 + }, + { + "epoch": 1.813858994069844, + "grad_norm": 2.6354005336761475, + "learning_rate": 1e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.723160982131958, + "num_tokens": 418316974.0, + "step": 16517 + }, + { + "epoch": 1.8139688117724577, + "grad_norm": 2.131511688232422, + "learning_rate": 1e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7276369333267212, + "num_tokens": 418342946.0, + "step": 16518 + }, + { + "epoch": 1.8140786294750715, + "grad_norm": 2.2076220512390137, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7102006673812866, + "num_tokens": 418369502.0, + "step": 16519 + }, + { + "epoch": 1.814188447177685, + "grad_norm": 2.3552086353302, + "learning_rate": 1e-06, + "loss": 0.8513, + "mean_token_accuracy": 0.728317141532898, + "num_tokens": 418392689.0, + "step": 16520 + }, + { + "epoch": 1.8142982648802986, + "grad_norm": 2.232363700866699, + "learning_rate": 1e-06, + "loss": 0.8151, + "mean_token_accuracy": 0.7435378432273865, + "num_tokens": 418416250.0, + "step": 16521 + }, + { + "epoch": 1.8144080825829123, + "grad_norm": 2.1141650676727295, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7139993906021118, + "num_tokens": 418444603.0, + "step": 16522 + }, + { + "epoch": 1.814517900285526, + "grad_norm": 2.3911075592041016, + "learning_rate": 1e-06, + "loss": 0.8094, + "mean_token_accuracy": 0.7405988574028015, + "num_tokens": 418466047.0, + "step": 16523 + }, + { + "epoch": 1.8146277179881398, + "grad_norm": 2.5775978565216064, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7218409776687622, + "num_tokens": 418487270.0, + "step": 16524 + }, + { + "epoch": 1.8147375356907534, + "grad_norm": 2.509023666381836, + "learning_rate": 1e-06, + "loss": 0.8361, + "mean_token_accuracy": 0.733200192451477, + "num_tokens": 418507514.0, + "step": 16525 + }, + { + "epoch": 1.814847353393367, + "grad_norm": 
2.5585997104644775, + "learning_rate": 1e-06, + "loss": 0.8647, + "mean_token_accuracy": 0.7291234731674194, + "num_tokens": 418528340.0, + "step": 16526 + }, + { + "epoch": 1.8149571710959806, + "grad_norm": 2.226893663406372, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7017987966537476, + "num_tokens": 418556386.0, + "step": 16527 + }, + { + "epoch": 1.8150669887985944, + "grad_norm": 2.5474724769592285, + "learning_rate": 1e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.723203182220459, + "num_tokens": 418576885.0, + "step": 16528 + }, + { + "epoch": 1.815176806501208, + "grad_norm": 2.133408784866333, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7159192562103271, + "num_tokens": 418604202.0, + "step": 16529 + }, + { + "epoch": 1.8152866242038217, + "grad_norm": 2.2539572715759277, + "learning_rate": 1e-06, + "loss": 0.8309, + "mean_token_accuracy": 0.7330917716026306, + "num_tokens": 418630211.0, + "step": 16530 + }, + { + "epoch": 1.8153964419064352, + "grad_norm": 2.5651602745056152, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7108759880065918, + "num_tokens": 418651833.0, + "step": 16531 + }, + { + "epoch": 1.815506259609049, + "grad_norm": 1.9856809377670288, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7112998962402344, + "num_tokens": 418683201.0, + "step": 16532 + }, + { + "epoch": 1.8156160773116627, + "grad_norm": 2.227752447128296, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7232428789138794, + "num_tokens": 418708335.0, + "step": 16533 + }, + { + "epoch": 1.8157258950142763, + "grad_norm": 2.0574610233306885, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7065190076828003, + "num_tokens": 418736906.0, + "step": 16534 + }, + { + "epoch": 1.8158357127168898, + "grad_norm": 2.1186883449554443, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.721747636795044, + "num_tokens": 
418764776.0, + "step": 16535 + }, + { + "epoch": 1.8159455304195036, + "grad_norm": 2.363016366958618, + "learning_rate": 1e-06, + "loss": 1.0108, + "mean_token_accuracy": 0.6872240304946899, + "num_tokens": 418789664.0, + "step": 16536 + }, + { + "epoch": 1.8160553481221173, + "grad_norm": 2.267998218536377, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7156215906143188, + "num_tokens": 418816479.0, + "step": 16537 + }, + { + "epoch": 1.816165165824731, + "grad_norm": 2.36354398727417, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7196974754333496, + "num_tokens": 418839799.0, + "step": 16538 + }, + { + "epoch": 1.8162749835273446, + "grad_norm": 2.0874853134155273, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7129396200180054, + "num_tokens": 418869913.0, + "step": 16539 + }, + { + "epoch": 1.8163848012299582, + "grad_norm": 2.4204835891723633, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7279671430587769, + "num_tokens": 418891748.0, + "step": 16540 + }, + { + "epoch": 1.816494618932572, + "grad_norm": 2.045192003250122, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7176498174667358, + "num_tokens": 418925577.0, + "step": 16541 + }, + { + "epoch": 1.8166044366351857, + "grad_norm": 2.504211902618408, + "learning_rate": 1e-06, + "loss": 0.8325, + "mean_token_accuracy": 0.7318422198295593, + "num_tokens": 418946856.0, + "step": 16542 + }, + { + "epoch": 1.8167142543377992, + "grad_norm": 2.330594539642334, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7198474407196045, + "num_tokens": 418972211.0, + "step": 16543 + }, + { + "epoch": 1.816824072040413, + "grad_norm": 1.9124460220336914, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.6935728192329407, + "num_tokens": 419006499.0, + "step": 16544 + }, + { + "epoch": 1.8169338897430265, + "grad_norm": 2.069046974182129, + "learning_rate": 1e-06, 
+ "loss": 0.9788, + "mean_token_accuracy": 0.7043945789337158, + "num_tokens": 419037368.0, + "step": 16545 + }, + { + "epoch": 1.8170437074456403, + "grad_norm": 2.489473581314087, + "learning_rate": 1e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7304825782775879, + "num_tokens": 419059053.0, + "step": 16546 + }, + { + "epoch": 1.817153525148254, + "grad_norm": 2.070305109024048, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7160780429840088, + "num_tokens": 419087358.0, + "step": 16547 + }, + { + "epoch": 1.8172633428508675, + "grad_norm": 2.521578073501587, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7060348987579346, + "num_tokens": 419110502.0, + "step": 16548 + }, + { + "epoch": 1.817373160553481, + "grad_norm": 2.263169050216675, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7162454128265381, + "num_tokens": 419134451.0, + "step": 16549 + }, + { + "epoch": 1.8174829782560948, + "grad_norm": 2.524221658706665, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7249583005905151, + "num_tokens": 419156791.0, + "step": 16550 + }, + { + "epoch": 1.8175927959587086, + "grad_norm": 2.0743942260742188, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7108582258224487, + "num_tokens": 419187156.0, + "step": 16551 + }, + { + "epoch": 1.8177026136613224, + "grad_norm": 2.4195187091827393, + "learning_rate": 1e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.7306537628173828, + "num_tokens": 419210439.0, + "step": 16552 + }, + { + "epoch": 1.8178124313639359, + "grad_norm": 2.016434669494629, + "learning_rate": 1e-06, + "loss": 0.8606, + "mean_token_accuracy": 0.7311341762542725, + "num_tokens": 419240896.0, + "step": 16553 + }, + { + "epoch": 1.8179222490665494, + "grad_norm": 2.196226119995117, + "learning_rate": 1e-06, + "loss": 0.791, + "mean_token_accuracy": 0.7531419992446899, + "num_tokens": 419263994.0, + "step": 16554 + }, + { + "epoch": 
1.8180320667691632, + "grad_norm": 2.520096778869629, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7014658451080322, + "num_tokens": 419289251.0, + "step": 16555 + }, + { + "epoch": 1.818141884471777, + "grad_norm": 2.3325273990631104, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7069132924079895, + "num_tokens": 419314267.0, + "step": 16556 + }, + { + "epoch": 1.8182517021743905, + "grad_norm": 2.4595894813537598, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7287987470626831, + "num_tokens": 419334593.0, + "step": 16557 + }, + { + "epoch": 1.818361519877004, + "grad_norm": 2.149611711502075, + "learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.6930553913116455, + "num_tokens": 419363292.0, + "step": 16558 + }, + { + "epoch": 1.8184713375796178, + "grad_norm": 2.234168767929077, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7274284362792969, + "num_tokens": 419388851.0, + "step": 16559 + }, + { + "epoch": 1.8185811552822315, + "grad_norm": 2.20185923576355, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7051134705543518, + "num_tokens": 419415123.0, + "step": 16560 + }, + { + "epoch": 1.8186909729848453, + "grad_norm": 2.5540716648101807, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7231943607330322, + "num_tokens": 419438506.0, + "step": 16561 + }, + { + "epoch": 1.8188007906874588, + "grad_norm": 2.3470075130462646, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7128726243972778, + "num_tokens": 419461880.0, + "step": 16562 + }, + { + "epoch": 1.8189106083900723, + "grad_norm": 2.537083625793457, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7101974487304688, + "num_tokens": 419484824.0, + "step": 16563 + }, + { + "epoch": 1.819020426092686, + "grad_norm": 2.0679216384887695, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 
0.7155749201774597, + "num_tokens": 419514235.0, + "step": 16564 + }, + { + "epoch": 1.8191302437952999, + "grad_norm": 2.2848587036132812, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7274772524833679, + "num_tokens": 419537711.0, + "step": 16565 + }, + { + "epoch": 1.8192400614979136, + "grad_norm": 2.499117136001587, + "learning_rate": 1e-06, + "loss": 0.7936, + "mean_token_accuracy": 0.751570463180542, + "num_tokens": 419559359.0, + "step": 16566 + }, + { + "epoch": 1.8193498792005272, + "grad_norm": 2.1627283096313477, + "learning_rate": 1e-06, + "loss": 0.8393, + "mean_token_accuracy": 0.7340232729911804, + "num_tokens": 419585526.0, + "step": 16567 + }, + { + "epoch": 1.8194596969031407, + "grad_norm": 2.3059794902801514, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7216440439224243, + "num_tokens": 419607709.0, + "step": 16568 + }, + { + "epoch": 1.8195695146057544, + "grad_norm": 2.265709638595581, + "learning_rate": 1e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7222356796264648, + "num_tokens": 419630948.0, + "step": 16569 + }, + { + "epoch": 1.8196793323083682, + "grad_norm": 2.5448429584503174, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7043534517288208, + "num_tokens": 419652498.0, + "step": 16570 + }, + { + "epoch": 1.8197891500109817, + "grad_norm": 2.9104552268981934, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.724315345287323, + "num_tokens": 419670064.0, + "step": 16571 + }, + { + "epoch": 1.8198989677135953, + "grad_norm": 2.293386459350586, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.6977419853210449, + "num_tokens": 419695438.0, + "step": 16572 + }, + { + "epoch": 1.820008785416209, + "grad_norm": 2.4342522621154785, + "learning_rate": 1e-06, + "loss": 0.8425, + "mean_token_accuracy": 0.7312412261962891, + "num_tokens": 419717091.0, + "step": 16573 + }, + { + "epoch": 1.8201186031188228, + "grad_norm": 
2.252345561981201, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7082089185714722, + "num_tokens": 419743327.0, + "step": 16574 + }, + { + "epoch": 1.8202284208214365, + "grad_norm": 2.1378917694091797, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7274991273880005, + "num_tokens": 419768779.0, + "step": 16575 + }, + { + "epoch": 1.82033823852405, + "grad_norm": 2.536216974258423, + "learning_rate": 1e-06, + "loss": 0.8944, + "mean_token_accuracy": 0.7185978889465332, + "num_tokens": 419789353.0, + "step": 16576 + }, + { + "epoch": 1.8204480562266636, + "grad_norm": 2.4171323776245117, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7198215126991272, + "num_tokens": 419814403.0, + "step": 16577 + }, + { + "epoch": 1.8205578739292774, + "grad_norm": 2.396191358566284, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7040438652038574, + "num_tokens": 419837402.0, + "step": 16578 + }, + { + "epoch": 1.8206676916318911, + "grad_norm": 2.121419906616211, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.6981616616249084, + "num_tokens": 419864664.0, + "step": 16579 + }, + { + "epoch": 1.8207775093345049, + "grad_norm": 2.2928383350372314, + "learning_rate": 1e-06, + "loss": 0.8383, + "mean_token_accuracy": 0.7379312515258789, + "num_tokens": 419888254.0, + "step": 16580 + }, + { + "epoch": 1.8208873270371184, + "grad_norm": 2.519289016723633, + "learning_rate": 1e-06, + "loss": 0.779, + "mean_token_accuracy": 0.7490357160568237, + "num_tokens": 419908064.0, + "step": 16581 + }, + { + "epoch": 1.820997144739732, + "grad_norm": 2.579052209854126, + "learning_rate": 1e-06, + "loss": 0.8397, + "mean_token_accuracy": 0.7282294034957886, + "num_tokens": 419928505.0, + "step": 16582 + }, + { + "epoch": 1.8211069624423457, + "grad_norm": 2.155522584915161, + "learning_rate": 1e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.7339193820953369, + "num_tokens": 
419956275.0, + "step": 16583 + }, + { + "epoch": 1.8212167801449595, + "grad_norm": 2.0822269916534424, + "learning_rate": 1e-06, + "loss": 1.0491, + "mean_token_accuracy": 0.6871392130851746, + "num_tokens": 419986414.0, + "step": 16584 + }, + { + "epoch": 1.821326597847573, + "grad_norm": 2.413750410079956, + "learning_rate": 1e-06, + "loss": 0.757, + "mean_token_accuracy": 0.7545130848884583, + "num_tokens": 420007067.0, + "step": 16585 + }, + { + "epoch": 1.8214364155501865, + "grad_norm": 2.183535099029541, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.713182806968689, + "num_tokens": 420034735.0, + "step": 16586 + }, + { + "epoch": 1.8215462332528003, + "grad_norm": 2.2697904109954834, + "learning_rate": 1e-06, + "loss": 0.8148, + "mean_token_accuracy": 0.7389733195304871, + "num_tokens": 420059362.0, + "step": 16587 + }, + { + "epoch": 1.821656050955414, + "grad_norm": 1.9307719469070435, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7083802223205566, + "num_tokens": 420093815.0, + "step": 16588 + }, + { + "epoch": 1.8217658686580278, + "grad_norm": 2.1347742080688477, + "learning_rate": 1e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.7253017425537109, + "num_tokens": 420120851.0, + "step": 16589 + }, + { + "epoch": 1.8218756863606413, + "grad_norm": 2.2327651977539062, + "learning_rate": 1e-06, + "loss": 0.8674, + "mean_token_accuracy": 0.7271124124526978, + "num_tokens": 420145280.0, + "step": 16590 + }, + { + "epoch": 1.8219855040632549, + "grad_norm": 1.972537636756897, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7082845568656921, + "num_tokens": 420174349.0, + "step": 16591 + }, + { + "epoch": 1.8220953217658686, + "grad_norm": 2.201680898666382, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7223000526428223, + "num_tokens": 420201197.0, + "step": 16592 + }, + { + "epoch": 1.8222051394684824, + "grad_norm": 2.0475032329559326, + "learning_rate": 
1e-06, + "loss": 0.917, + "mean_token_accuracy": 0.7178797721862793, + "num_tokens": 420230738.0, + "step": 16593 + }, + { + "epoch": 1.822314957171096, + "grad_norm": 2.143862247467041, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7147100567817688, + "num_tokens": 420259392.0, + "step": 16594 + }, + { + "epoch": 1.8224247748737097, + "grad_norm": 2.3556551933288574, + "learning_rate": 1e-06, + "loss": 0.845, + "mean_token_accuracy": 0.7301693558692932, + "num_tokens": 420282659.0, + "step": 16595 + }, + { + "epoch": 1.8225345925763232, + "grad_norm": 2.226606845855713, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7170565128326416, + "num_tokens": 420309187.0, + "step": 16596 + }, + { + "epoch": 1.822644410278937, + "grad_norm": 2.2043638229370117, + "learning_rate": 1e-06, + "loss": 1.0959, + "mean_token_accuracy": 0.6841307282447815, + "num_tokens": 420337983.0, + "step": 16597 + }, + { + "epoch": 1.8227542279815507, + "grad_norm": 2.201134443283081, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7146643400192261, + "num_tokens": 420363893.0, + "step": 16598 + }, + { + "epoch": 1.8228640456841643, + "grad_norm": 2.0728611946105957, + "learning_rate": 1e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7234523296356201, + "num_tokens": 420393420.0, + "step": 16599 + }, + { + "epoch": 1.8229738633867778, + "grad_norm": 2.1953036785125732, + "learning_rate": 1e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.714867353439331, + "num_tokens": 420418551.0, + "step": 16600 + }, + { + "epoch": 1.8230836810893916, + "grad_norm": 2.154553174972534, + "learning_rate": 1e-06, + "loss": 0.794, + "mean_token_accuracy": 0.7527635097503662, + "num_tokens": 420446510.0, + "step": 16601 + }, + { + "epoch": 1.8231934987920053, + "grad_norm": 2.5874361991882324, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7176201343536377, + "num_tokens": 420467585.0, + "step": 16602 + }, + { + 
"epoch": 1.823303316494619, + "grad_norm": 2.1099510192871094, + "learning_rate": 1e-06, + "loss": 0.896, + "mean_token_accuracy": 0.7258137464523315, + "num_tokens": 420495153.0, + "step": 16603 + }, + { + "epoch": 1.8234131341972326, + "grad_norm": 2.1744277477264404, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7271372079849243, + "num_tokens": 420520649.0, + "step": 16604 + }, + { + "epoch": 1.8235229518998461, + "grad_norm": 2.1359901428222656, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7069585919380188, + "num_tokens": 420547575.0, + "step": 16605 + }, + { + "epoch": 1.82363276960246, + "grad_norm": 2.236597776412964, + "learning_rate": 1e-06, + "loss": 0.8729, + "mean_token_accuracy": 0.7279236316680908, + "num_tokens": 420572134.0, + "step": 16606 + }, + { + "epoch": 1.8237425873050737, + "grad_norm": 2.3800032138824463, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7100871205329895, + "num_tokens": 420595183.0, + "step": 16607 + }, + { + "epoch": 1.8238524050076872, + "grad_norm": 2.121885061264038, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7105729579925537, + "num_tokens": 420621801.0, + "step": 16608 + }, + { + "epoch": 1.823962222710301, + "grad_norm": 2.4886057376861572, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7154689431190491, + "num_tokens": 420643126.0, + "step": 16609 + }, + { + "epoch": 1.8240720404129145, + "grad_norm": 2.2238776683807373, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.720093309879303, + "num_tokens": 420668537.0, + "step": 16610 + }, + { + "epoch": 1.8241818581155282, + "grad_norm": 2.5480542182922363, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.6970638036727905, + "num_tokens": 420690020.0, + "step": 16611 + }, + { + "epoch": 1.824291675818142, + "grad_norm": 2.278264284133911, + "learning_rate": 1e-06, + "loss": 0.8685, + 
"mean_token_accuracy": 0.726769208908081, + "num_tokens": 420713768.0, + "step": 16612 + }, + { + "epoch": 1.8244014935207555, + "grad_norm": 2.447646141052246, + "learning_rate": 1e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.7267760038375854, + "num_tokens": 420741830.0, + "step": 16613 + }, + { + "epoch": 1.824511311223369, + "grad_norm": 2.0529017448425293, + "learning_rate": 1e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7196691036224365, + "num_tokens": 420769640.0, + "step": 16614 + }, + { + "epoch": 1.8246211289259828, + "grad_norm": 2.1285159587860107, + "learning_rate": 1e-06, + "loss": 0.7775, + "mean_token_accuracy": 0.7453351020812988, + "num_tokens": 420796497.0, + "step": 16615 + }, + { + "epoch": 1.8247309466285966, + "grad_norm": 2.2009570598602295, + "learning_rate": 1e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.7453187704086304, + "num_tokens": 420820528.0, + "step": 16616 + }, + { + "epoch": 1.8248407643312103, + "grad_norm": 2.178802967071533, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7164418697357178, + "num_tokens": 420848183.0, + "step": 16617 + }, + { + "epoch": 1.8249505820338239, + "grad_norm": 2.0543859004974365, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.701689600944519, + "num_tokens": 420880254.0, + "step": 16618 + }, + { + "epoch": 1.8250603997364374, + "grad_norm": 2.1201696395874023, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7168442010879517, + "num_tokens": 420907670.0, + "step": 16619 + }, + { + "epoch": 1.8251702174390512, + "grad_norm": 2.3132379055023193, + "learning_rate": 1e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.7305216789245605, + "num_tokens": 420932688.0, + "step": 16620 + }, + { + "epoch": 1.825280035141665, + "grad_norm": 2.3335092067718506, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.710092306137085, + "num_tokens": 420956144.0, + "step": 16621 + }, + { + "epoch": 
1.8253898528442785, + "grad_norm": 2.209063768386841, + "learning_rate": 1e-06, + "loss": 0.8703, + "mean_token_accuracy": 0.7329742908477783, + "num_tokens": 420981949.0, + "step": 16622 + }, + { + "epoch": 1.825499670546892, + "grad_norm": 2.4797778129577637, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7186389565467834, + "num_tokens": 421005297.0, + "step": 16623 + }, + { + "epoch": 1.8256094882495058, + "grad_norm": 2.4956483840942383, + "learning_rate": 1e-06, + "loss": 0.7399, + "mean_token_accuracy": 0.7603122591972351, + "num_tokens": 421024783.0, + "step": 16624 + }, + { + "epoch": 1.8257193059521195, + "grad_norm": 2.236887216567993, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7233558297157288, + "num_tokens": 421051500.0, + "step": 16625 + }, + { + "epoch": 1.8258291236547333, + "grad_norm": 2.3273539543151855, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7076882123947144, + "num_tokens": 421077204.0, + "step": 16626 + }, + { + "epoch": 1.8259389413573468, + "grad_norm": 2.1568503379821777, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7103654146194458, + "num_tokens": 421104967.0, + "step": 16627 + }, + { + "epoch": 1.8260487590599603, + "grad_norm": 2.1581671237945557, + "learning_rate": 1e-06, + "loss": 0.8507, + "mean_token_accuracy": 0.7312692403793335, + "num_tokens": 421131781.0, + "step": 16628 + }, + { + "epoch": 1.826158576762574, + "grad_norm": 2.2124550342559814, + "learning_rate": 1e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7134990692138672, + "num_tokens": 421157261.0, + "step": 16629 + }, + { + "epoch": 1.8262683944651878, + "grad_norm": 2.165289878845215, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7201189994812012, + "num_tokens": 421182573.0, + "step": 16630 + }, + { + "epoch": 1.8263782121678016, + "grad_norm": 2.2067997455596924, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 
0.7040774822235107, + "num_tokens": 421211812.0, + "step": 16631 + }, + { + "epoch": 1.8264880298704151, + "grad_norm": 2.342075824737549, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7050544023513794, + "num_tokens": 421236050.0, + "step": 16632 + }, + { + "epoch": 1.8265978475730287, + "grad_norm": 2.1167590618133545, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7160674929618835, + "num_tokens": 421263961.0, + "step": 16633 + }, + { + "epoch": 1.8267076652756424, + "grad_norm": 2.048086643218994, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7265424728393555, + "num_tokens": 421294384.0, + "step": 16634 + }, + { + "epoch": 1.8268174829782562, + "grad_norm": 2.415133237838745, + "learning_rate": 1e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.7229459285736084, + "num_tokens": 421318881.0, + "step": 16635 + }, + { + "epoch": 1.8269273006808697, + "grad_norm": 2.5298473834991455, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.722885012626648, + "num_tokens": 421339628.0, + "step": 16636 + }, + { + "epoch": 1.8270371183834833, + "grad_norm": 2.2946157455444336, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7036969661712646, + "num_tokens": 421363895.0, + "step": 16637 + }, + { + "epoch": 1.827146936086097, + "grad_norm": 2.0264809131622314, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7100463509559631, + "num_tokens": 421395010.0, + "step": 16638 + }, + { + "epoch": 1.8272567537887108, + "grad_norm": 2.0567898750305176, + "learning_rate": 1e-06, + "loss": 0.8547, + "mean_token_accuracy": 0.7310817837715149, + "num_tokens": 421422533.0, + "step": 16639 + }, + { + "epoch": 1.8273665714913245, + "grad_norm": 2.002821445465088, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7060507535934448, + "num_tokens": 421453864.0, + "step": 16640 + }, + { + "epoch": 1.827476389193938, + "grad_norm": 
2.5984060764312744, + "learning_rate": 1e-06, + "loss": 0.7752, + "mean_token_accuracy": 0.74748694896698, + "num_tokens": 421471535.0, + "step": 16641 + }, + { + "epoch": 1.8275862068965516, + "grad_norm": 2.2760021686553955, + "learning_rate": 1e-06, + "loss": 0.8332, + "mean_token_accuracy": 0.7342358827590942, + "num_tokens": 421495649.0, + "step": 16642 + }, + { + "epoch": 1.8276960245991654, + "grad_norm": 2.32047963142395, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7061598300933838, + "num_tokens": 421521693.0, + "step": 16643 + }, + { + "epoch": 1.8278058423017791, + "grad_norm": 1.9454246759414673, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.6907680034637451, + "num_tokens": 421555135.0, + "step": 16644 + }, + { + "epoch": 1.8279156600043929, + "grad_norm": 2.455854892730713, + "learning_rate": 1e-06, + "loss": 0.7516, + "mean_token_accuracy": 0.7596112489700317, + "num_tokens": 421576041.0, + "step": 16645 + }, + { + "epoch": 1.8280254777070064, + "grad_norm": 2.081169605255127, + "learning_rate": 1e-06, + "loss": 0.8071, + "mean_token_accuracy": 0.7386215925216675, + "num_tokens": 421603179.0, + "step": 16646 + }, + { + "epoch": 1.82813529540962, + "grad_norm": 2.409327268600464, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7115589380264282, + "num_tokens": 421625073.0, + "step": 16647 + }, + { + "epoch": 1.8282451131122337, + "grad_norm": 2.3210854530334473, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.708972692489624, + "num_tokens": 421646093.0, + "step": 16648 + }, + { + "epoch": 1.8283549308148475, + "grad_norm": 2.178741693496704, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7266110181808472, + "num_tokens": 421671342.0, + "step": 16649 + }, + { + "epoch": 1.828464748517461, + "grad_norm": 2.532198190689087, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7041372060775757, + "num_tokens": 
421692429.0, + "step": 16650 + }, + { + "epoch": 1.8285745662200745, + "grad_norm": 2.1365160942077637, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7177308797836304, + "num_tokens": 421720509.0, + "step": 16651 + }, + { + "epoch": 1.8286843839226883, + "grad_norm": 2.033548355102539, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7179945707321167, + "num_tokens": 421749081.0, + "step": 16652 + }, + { + "epoch": 1.828794201625302, + "grad_norm": 2.197962760925293, + "learning_rate": 1e-06, + "loss": 0.8483, + "mean_token_accuracy": 0.7330970764160156, + "num_tokens": 421774354.0, + "step": 16653 + }, + { + "epoch": 1.8289040193279158, + "grad_norm": 2.3942909240722656, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.713549792766571, + "num_tokens": 421798837.0, + "step": 16654 + }, + { + "epoch": 1.8290138370305293, + "grad_norm": 2.143672466278076, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.719845175743103, + "num_tokens": 421827388.0, + "step": 16655 + }, + { + "epoch": 1.8291236547331429, + "grad_norm": 1.979214072227478, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7123535871505737, + "num_tokens": 421858030.0, + "step": 16656 + }, + { + "epoch": 1.8292334724357566, + "grad_norm": 2.4281187057495117, + "learning_rate": 1e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.7338926196098328, + "num_tokens": 421879708.0, + "step": 16657 + }, + { + "epoch": 1.8293432901383704, + "grad_norm": 2.4107468128204346, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7038737535476685, + "num_tokens": 421905467.0, + "step": 16658 + }, + { + "epoch": 1.829453107840984, + "grad_norm": 2.84428071975708, + "learning_rate": 1e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.7197278141975403, + "num_tokens": 421923409.0, + "step": 16659 + }, + { + "epoch": 1.8295629255435977, + "grad_norm": 2.3091509342193604, + "learning_rate": 1e-06, 
+ "loss": 0.8979, + "mean_token_accuracy": 0.7276331186294556, + "num_tokens": 421951487.0, + "step": 16660 + }, + { + "epoch": 1.8296727432462112, + "grad_norm": 2.483912706375122, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7299366593360901, + "num_tokens": 421974155.0, + "step": 16661 + }, + { + "epoch": 1.829782560948825, + "grad_norm": 2.034773111343384, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7164139747619629, + "num_tokens": 422002472.0, + "step": 16662 + }, + { + "epoch": 1.8298923786514387, + "grad_norm": 2.2443997859954834, + "learning_rate": 1e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.730699896812439, + "num_tokens": 422025586.0, + "step": 16663 + }, + { + "epoch": 1.8300021963540523, + "grad_norm": 2.309054374694824, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7091695666313171, + "num_tokens": 422049276.0, + "step": 16664 + }, + { + "epoch": 1.8301120140566658, + "grad_norm": 2.379288911819458, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7163615226745605, + "num_tokens": 422073154.0, + "step": 16665 + }, + { + "epoch": 1.8302218317592795, + "grad_norm": 2.2540013790130615, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.71385258436203, + "num_tokens": 422098417.0, + "step": 16666 + }, + { + "epoch": 1.8303316494618933, + "grad_norm": 2.14310884475708, + "learning_rate": 1e-06, + "loss": 0.8267, + "mean_token_accuracy": 0.7414869070053101, + "num_tokens": 422126484.0, + "step": 16667 + }, + { + "epoch": 1.830441467164507, + "grad_norm": 2.3146562576293945, + "learning_rate": 1e-06, + "loss": 0.7724, + "mean_token_accuracy": 0.7560207843780518, + "num_tokens": 422149518.0, + "step": 16668 + }, + { + "epoch": 1.8305512848671206, + "grad_norm": 2.3018462657928467, + "learning_rate": 1e-06, + "loss": 0.8561, + "mean_token_accuracy": 0.7292856574058533, + "num_tokens": 422172255.0, + "step": 16669 + }, + { + "epoch": 
1.8306611025697341, + "grad_norm": 2.521056652069092, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7260864973068237, + "num_tokens": 422192505.0, + "step": 16670 + }, + { + "epoch": 1.830770920272348, + "grad_norm": 2.0119729042053223, + "learning_rate": 1e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.722680389881134, + "num_tokens": 422220509.0, + "step": 16671 + }, + { + "epoch": 1.8308807379749616, + "grad_norm": 2.124877452850342, + "learning_rate": 1e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.719306468963623, + "num_tokens": 422251074.0, + "step": 16672 + }, + { + "epoch": 1.8309905556775752, + "grad_norm": 1.9834918975830078, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.6955480575561523, + "num_tokens": 422282545.0, + "step": 16673 + }, + { + "epoch": 1.831100373380189, + "grad_norm": 2.5909786224365234, + "learning_rate": 1e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.7300128936767578, + "num_tokens": 422303994.0, + "step": 16674 + }, + { + "epoch": 1.8312101910828025, + "grad_norm": 2.3926594257354736, + "learning_rate": 1e-06, + "loss": 0.7779, + "mean_token_accuracy": 0.74918532371521, + "num_tokens": 422326622.0, + "step": 16675 + }, + { + "epoch": 1.8313200087854162, + "grad_norm": 2.5170979499816895, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7101200819015503, + "num_tokens": 422350738.0, + "step": 16676 + }, + { + "epoch": 1.83142982648803, + "grad_norm": 2.266771078109741, + "learning_rate": 1e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.7259975671768188, + "num_tokens": 422376502.0, + "step": 16677 + }, + { + "epoch": 1.8315396441906435, + "grad_norm": 2.636819362640381, + "learning_rate": 1e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7154648303985596, + "num_tokens": 422397318.0, + "step": 16678 + }, + { + "epoch": 1.831649461893257, + "grad_norm": 2.3514163494110107, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 
0.7180981636047363, + "num_tokens": 422421259.0, + "step": 16679 + }, + { + "epoch": 1.8317592795958708, + "grad_norm": 2.0393097400665283, + "learning_rate": 1e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.7350488901138306, + "num_tokens": 422450628.0, + "step": 16680 + }, + { + "epoch": 1.8318690972984846, + "grad_norm": 2.218555212020874, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7139836549758911, + "num_tokens": 422475947.0, + "step": 16681 + }, + { + "epoch": 1.8319789150010983, + "grad_norm": 2.3485958576202393, + "learning_rate": 1e-06, + "loss": 0.7955, + "mean_token_accuracy": 0.7401642203330994, + "num_tokens": 422499673.0, + "step": 16682 + }, + { + "epoch": 1.8320887327037119, + "grad_norm": 2.2987961769104004, + "learning_rate": 1e-06, + "loss": 0.8425, + "mean_token_accuracy": 0.7303448915481567, + "num_tokens": 422522038.0, + "step": 16683 + }, + { + "epoch": 1.8321985504063254, + "grad_norm": 2.175236701965332, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.6957401037216187, + "num_tokens": 422550241.0, + "step": 16684 + }, + { + "epoch": 1.8323083681089392, + "grad_norm": 2.4665749073028564, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7195563316345215, + "num_tokens": 422572147.0, + "step": 16685 + }, + { + "epoch": 1.832418185811553, + "grad_norm": 2.2938365936279297, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7253993153572083, + "num_tokens": 422595359.0, + "step": 16686 + }, + { + "epoch": 1.8325280035141664, + "grad_norm": 2.0919997692108154, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7093794345855713, + "num_tokens": 422623799.0, + "step": 16687 + }, + { + "epoch": 1.83263782121678, + "grad_norm": 2.4092297554016113, + "learning_rate": 1e-06, + "loss": 0.8217, + "mean_token_accuracy": 0.7386283874511719, + "num_tokens": 422645831.0, + "step": 16688 + }, + { + "epoch": 1.8327476389193937, + "grad_norm": 
2.377589225769043, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7285420298576355, + "num_tokens": 422667649.0, + "step": 16689 + }, + { + "epoch": 1.8328574566220075, + "grad_norm": 2.3286020755767822, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.717908501625061, + "num_tokens": 422694086.0, + "step": 16690 + }, + { + "epoch": 1.8329672743246213, + "grad_norm": 2.2690725326538086, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7042900919914246, + "num_tokens": 422719232.0, + "step": 16691 + }, + { + "epoch": 1.8330770920272348, + "grad_norm": 2.0464069843292236, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7126644849777222, + "num_tokens": 422747740.0, + "step": 16692 + }, + { + "epoch": 1.8331869097298483, + "grad_norm": 2.149538516998291, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7304721474647522, + "num_tokens": 422775971.0, + "step": 16693 + }, + { + "epoch": 1.833296727432462, + "grad_norm": 1.9323469400405884, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7089796662330627, + "num_tokens": 422808940.0, + "step": 16694 + }, + { + "epoch": 1.8334065451350758, + "grad_norm": 2.10884690284729, + "learning_rate": 1e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.7319377064704895, + "num_tokens": 422834814.0, + "step": 16695 + }, + { + "epoch": 1.8335163628376896, + "grad_norm": 2.62187123298645, + "learning_rate": 1e-06, + "loss": 0.87, + "mean_token_accuracy": 0.726664662361145, + "num_tokens": 422854392.0, + "step": 16696 + }, + { + "epoch": 1.8336261805403031, + "grad_norm": 2.030029535293579, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7010259628295898, + "num_tokens": 422884932.0, + "step": 16697 + }, + { + "epoch": 1.8337359982429167, + "grad_norm": 2.394465684890747, + "learning_rate": 1e-06, + "loss": 0.8102, + "mean_token_accuracy": 0.7432439923286438, + "num_tokens": 
422907715.0, + "step": 16698 + }, + { + "epoch": 1.8338458159455304, + "grad_norm": 2.1547837257385254, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7214037775993347, + "num_tokens": 422936882.0, + "step": 16699 + }, + { + "epoch": 1.8339556336481442, + "grad_norm": 2.285943031311035, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7252496480941772, + "num_tokens": 422963029.0, + "step": 16700 + }, + { + "epoch": 1.8340654513507577, + "grad_norm": 2.1677448749542236, + "learning_rate": 1e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7169157266616821, + "num_tokens": 422988356.0, + "step": 16701 + }, + { + "epoch": 1.8341752690533712, + "grad_norm": 2.6551332473754883, + "learning_rate": 1e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.7222554087638855, + "num_tokens": 423008993.0, + "step": 16702 + }, + { + "epoch": 1.834285086755985, + "grad_norm": 2.2592577934265137, + "learning_rate": 1e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7155507802963257, + "num_tokens": 423034772.0, + "step": 16703 + }, + { + "epoch": 1.8343949044585988, + "grad_norm": 2.059291362762451, + "learning_rate": 1e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7284166216850281, + "num_tokens": 423062595.0, + "step": 16704 + }, + { + "epoch": 1.8345047221612125, + "grad_norm": 2.0254411697387695, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7124325037002563, + "num_tokens": 423094400.0, + "step": 16705 + }, + { + "epoch": 1.834614539863826, + "grad_norm": 2.2141225337982178, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7212114930152893, + "num_tokens": 423117791.0, + "step": 16706 + }, + { + "epoch": 1.8347243575664396, + "grad_norm": 2.393110752105713, + "learning_rate": 1e-06, + "loss": 0.8546, + "mean_token_accuracy": 0.7320576906204224, + "num_tokens": 423140587.0, + "step": 16707 + }, + { + "epoch": 1.8348341752690533, + "grad_norm": 2.2067298889160156, + "learning_rate": 
1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7109301090240479, + "num_tokens": 423168616.0, + "step": 16708 + }, + { + "epoch": 1.834943992971667, + "grad_norm": 2.2307581901550293, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7060922384262085, + "num_tokens": 423195213.0, + "step": 16709 + }, + { + "epoch": 1.8350538106742806, + "grad_norm": 2.2649335861206055, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.712824821472168, + "num_tokens": 423220106.0, + "step": 16710 + }, + { + "epoch": 1.8351636283768944, + "grad_norm": 2.4949917793273926, + "learning_rate": 1e-06, + "loss": 0.832, + "mean_token_accuracy": 0.7371343374252319, + "num_tokens": 423240988.0, + "step": 16711 + }, + { + "epoch": 1.835273446079508, + "grad_norm": 2.394226312637329, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7242783308029175, + "num_tokens": 423264246.0, + "step": 16712 + }, + { + "epoch": 1.8353832637821217, + "grad_norm": 2.581263780593872, + "learning_rate": 1e-06, + "loss": 0.8516, + "mean_token_accuracy": 0.730819821357727, + "num_tokens": 423285521.0, + "step": 16713 + }, + { + "epoch": 1.8354930814847354, + "grad_norm": 2.4658279418945312, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.719340443611145, + "num_tokens": 423307692.0, + "step": 16714 + }, + { + "epoch": 1.835602899187349, + "grad_norm": 2.092902898788452, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7230622172355652, + "num_tokens": 423335496.0, + "step": 16715 + }, + { + "epoch": 1.8357127168899625, + "grad_norm": 2.206490993499756, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7159647345542908, + "num_tokens": 423361675.0, + "step": 16716 + }, + { + "epoch": 1.8358225345925763, + "grad_norm": 2.082084894180298, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7234713435173035, + "num_tokens": 423391770.0, + "step": 16717 + }, + { + 
"epoch": 1.83593235229519, + "grad_norm": 2.2636466026306152, + "learning_rate": 1e-06, + "loss": 0.8526, + "mean_token_accuracy": 0.7309359312057495, + "num_tokens": 423419649.0, + "step": 16718 + }, + { + "epoch": 1.8360421699978038, + "grad_norm": 2.075967311859131, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7171669006347656, + "num_tokens": 423451839.0, + "step": 16719 + }, + { + "epoch": 1.8361519877004173, + "grad_norm": 2.2178030014038086, + "learning_rate": 1e-06, + "loss": 0.8306, + "mean_token_accuracy": 0.7368823885917664, + "num_tokens": 423475973.0, + "step": 16720 + }, + { + "epoch": 1.8362618054030309, + "grad_norm": 2.2231850624084473, + "learning_rate": 1e-06, + "loss": 0.8458, + "mean_token_accuracy": 0.7360391616821289, + "num_tokens": 423501464.0, + "step": 16721 + }, + { + "epoch": 1.8363716231056446, + "grad_norm": 2.0936460494995117, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7063989639282227, + "num_tokens": 423529744.0, + "step": 16722 + }, + { + "epoch": 1.8364814408082584, + "grad_norm": 2.178406000137329, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.6990402936935425, + "num_tokens": 423556697.0, + "step": 16723 + }, + { + "epoch": 1.836591258510872, + "grad_norm": 2.177028179168701, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7147625684738159, + "num_tokens": 423586135.0, + "step": 16724 + }, + { + "epoch": 1.8367010762134857, + "grad_norm": 2.440460443496704, + "learning_rate": 1e-06, + "loss": 0.8239, + "mean_token_accuracy": 0.7447748184204102, + "num_tokens": 423608017.0, + "step": 16725 + }, + { + "epoch": 1.8368108939160992, + "grad_norm": 2.6444618701934814, + "learning_rate": 1e-06, + "loss": 0.8549, + "mean_token_accuracy": 0.7334540486335754, + "num_tokens": 423627504.0, + "step": 16726 + }, + { + "epoch": 1.836920711618713, + "grad_norm": 2.2882163524627686, + "learning_rate": 1e-06, + "loss": 0.8967, + 
"mean_token_accuracy": 0.7265041470527649, + "num_tokens": 423651292.0, + "step": 16727 + }, + { + "epoch": 1.8370305293213267, + "grad_norm": 2.582195997238159, + "learning_rate": 1e-06, + "loss": 0.756, + "mean_token_accuracy": 0.7545411586761475, + "num_tokens": 423670764.0, + "step": 16728 + }, + { + "epoch": 1.8371403470239402, + "grad_norm": 2.3926007747650146, + "learning_rate": 1e-06, + "loss": 0.8269, + "mean_token_accuracy": 0.7417831420898438, + "num_tokens": 423694531.0, + "step": 16729 + }, + { + "epoch": 1.8372501647265538, + "grad_norm": 2.042660713195801, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7240660786628723, + "num_tokens": 423726453.0, + "step": 16730 + }, + { + "epoch": 1.8373599824291675, + "grad_norm": 2.55344295501709, + "learning_rate": 1e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7272077798843384, + "num_tokens": 423747435.0, + "step": 16731 + }, + { + "epoch": 1.8374698001317813, + "grad_norm": 2.128627061843872, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7107900977134705, + "num_tokens": 423774466.0, + "step": 16732 + }, + { + "epoch": 1.837579617834395, + "grad_norm": 2.196340560913086, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.720346987247467, + "num_tokens": 423799277.0, + "step": 16733 + }, + { + "epoch": 1.8376894355370086, + "grad_norm": 2.5469725131988525, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7095974683761597, + "num_tokens": 423821186.0, + "step": 16734 + }, + { + "epoch": 1.8377992532396221, + "grad_norm": 2.230682134628296, + "learning_rate": 1e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7239307165145874, + "num_tokens": 423845089.0, + "step": 16735 + }, + { + "epoch": 1.8379090709422359, + "grad_norm": 2.283965587615967, + "learning_rate": 1e-06, + "loss": 0.8674, + "mean_token_accuracy": 0.7270655035972595, + "num_tokens": 423869980.0, + "step": 16736 + }, + { + "epoch": 1.8380188886448496, + 
"grad_norm": 2.0664610862731934, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7160841226577759, + "num_tokens": 423899911.0, + "step": 16737 + }, + { + "epoch": 1.8381287063474632, + "grad_norm": 2.7050158977508545, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7300524711608887, + "num_tokens": 423920019.0, + "step": 16738 + }, + { + "epoch": 1.838238524050077, + "grad_norm": 2.039423942565918, + "learning_rate": 1e-06, + "loss": 0.877, + "mean_token_accuracy": 0.71904456615448, + "num_tokens": 423949575.0, + "step": 16739 + }, + { + "epoch": 1.8383483417526905, + "grad_norm": 2.1742665767669678, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7122319936752319, + "num_tokens": 423976874.0, + "step": 16740 + }, + { + "epoch": 1.8384581594553042, + "grad_norm": 2.1334993839263916, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7256374359130859, + "num_tokens": 424004976.0, + "step": 16741 + }, + { + "epoch": 1.838567977157918, + "grad_norm": 2.371548652648926, + "learning_rate": 1e-06, + "loss": 0.8055, + "mean_token_accuracy": 0.7413039207458496, + "num_tokens": 424026241.0, + "step": 16742 + }, + { + "epoch": 1.8386777948605315, + "grad_norm": 2.455777406692505, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7060025930404663, + "num_tokens": 424049210.0, + "step": 16743 + }, + { + "epoch": 1.838787612563145, + "grad_norm": 2.4404776096343994, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7252764701843262, + "num_tokens": 424071093.0, + "step": 16744 + }, + { + "epoch": 1.8388974302657588, + "grad_norm": 3.075409173965454, + "learning_rate": 1e-06, + "loss": 0.7664, + "mean_token_accuracy": 0.753597617149353, + "num_tokens": 424086354.0, + "step": 16745 + }, + { + "epoch": 1.8390072479683726, + "grad_norm": 2.2878687381744385, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.6959226131439209, + 
"num_tokens": 424115023.0, + "step": 16746 + }, + { + "epoch": 1.8391170656709863, + "grad_norm": 2.138063907623291, + "learning_rate": 1e-06, + "loss": 0.8591, + "mean_token_accuracy": 0.7250324487686157, + "num_tokens": 424140935.0, + "step": 16747 + }, + { + "epoch": 1.8392268833735999, + "grad_norm": 2.0818421840667725, + "learning_rate": 1e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.7279888391494751, + "num_tokens": 424171369.0, + "step": 16748 + }, + { + "epoch": 1.8393367010762134, + "grad_norm": 2.214292049407959, + "learning_rate": 1e-06, + "loss": 0.846, + "mean_token_accuracy": 0.731164276599884, + "num_tokens": 424197155.0, + "step": 16749 + }, + { + "epoch": 1.8394465187788271, + "grad_norm": 2.7745397090911865, + "learning_rate": 1e-06, + "loss": 0.773, + "mean_token_accuracy": 0.7491222620010376, + "num_tokens": 424213618.0, + "step": 16750 + }, + { + "epoch": 1.839556336481441, + "grad_norm": 2.081624746322632, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.6973693370819092, + "num_tokens": 424244199.0, + "step": 16751 + }, + { + "epoch": 1.8396661541840544, + "grad_norm": 2.618861436843872, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7069816589355469, + "num_tokens": 424271108.0, + "step": 16752 + }, + { + "epoch": 1.839775971886668, + "grad_norm": 2.284254312515259, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.714083194732666, + "num_tokens": 424296987.0, + "step": 16753 + }, + { + "epoch": 1.8398857895892817, + "grad_norm": 2.3715476989746094, + "learning_rate": 1e-06, + "loss": 0.8177, + "mean_token_accuracy": 0.7393331527709961, + "num_tokens": 424320309.0, + "step": 16754 + }, + { + "epoch": 1.8399956072918955, + "grad_norm": 2.5966837406158447, + "learning_rate": 1e-06, + "loss": 0.8217, + "mean_token_accuracy": 0.7397778034210205, + "num_tokens": 424339327.0, + "step": 16755 + }, + { + "epoch": 1.8401054249945092, + "grad_norm": 2.292051315307617, + 
"learning_rate": 1e-06, + "loss": 0.8406, + "mean_token_accuracy": 0.7336349487304688, + "num_tokens": 424361949.0, + "step": 16756 + }, + { + "epoch": 1.8402152426971228, + "grad_norm": 2.1382689476013184, + "learning_rate": 1e-06, + "loss": 0.8757, + "mean_token_accuracy": 0.7253960371017456, + "num_tokens": 424388045.0, + "step": 16757 + }, + { + "epoch": 1.8403250603997363, + "grad_norm": 2.2665207386016846, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7139458060264587, + "num_tokens": 424415219.0, + "step": 16758 + }, + { + "epoch": 1.84043487810235, + "grad_norm": 2.3492157459259033, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7201324105262756, + "num_tokens": 424439398.0, + "step": 16759 + }, + { + "epoch": 1.8405446958049638, + "grad_norm": 2.3571300506591797, + "learning_rate": 1e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7087761759757996, + "num_tokens": 424462557.0, + "step": 16760 + }, + { + "epoch": 1.8406545135075776, + "grad_norm": 1.9254264831542969, + "learning_rate": 1e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7125406265258789, + "num_tokens": 424495422.0, + "step": 16761 + }, + { + "epoch": 1.8407643312101911, + "grad_norm": 2.264052152633667, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7133052349090576, + "num_tokens": 424521193.0, + "step": 16762 + }, + { + "epoch": 1.8408741489128047, + "grad_norm": 2.156179666519165, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7220545411109924, + "num_tokens": 424548284.0, + "step": 16763 + }, + { + "epoch": 1.8409839666154184, + "grad_norm": 2.2601852416992188, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7205763459205627, + "num_tokens": 424575118.0, + "step": 16764 + }, + { + "epoch": 1.8410937843180322, + "grad_norm": 2.4688222408294678, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7420378923416138, + "num_tokens": 424597501.0, + 
"step": 16765 + }, + { + "epoch": 1.8412036020206457, + "grad_norm": 2.1538538932800293, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7231786251068115, + "num_tokens": 424623894.0, + "step": 16766 + }, + { + "epoch": 1.8413134197232592, + "grad_norm": 1.9756574630737305, + "learning_rate": 1e-06, + "loss": 0.8452, + "mean_token_accuracy": 0.7329548597335815, + "num_tokens": 424656030.0, + "step": 16767 + }, + { + "epoch": 1.841423237425873, + "grad_norm": 2.410322904586792, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7269641757011414, + "num_tokens": 424676517.0, + "step": 16768 + }, + { + "epoch": 1.8415330551284868, + "grad_norm": 2.1111903190612793, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7142763733863831, + "num_tokens": 424705971.0, + "step": 16769 + }, + { + "epoch": 1.8416428728311005, + "grad_norm": 2.0550012588500977, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7104101181030273, + "num_tokens": 424737387.0, + "step": 16770 + }, + { + "epoch": 1.841752690533714, + "grad_norm": 2.221778631210327, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7262177467346191, + "num_tokens": 424763737.0, + "step": 16771 + }, + { + "epoch": 1.8418625082363276, + "grad_norm": 2.4900319576263428, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7201142907142639, + "num_tokens": 424785701.0, + "step": 16772 + }, + { + "epoch": 1.8419723259389413, + "grad_norm": 2.365845203399658, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7058395743370056, + "num_tokens": 424812115.0, + "step": 16773 + }, + { + "epoch": 1.842082143641555, + "grad_norm": 2.1648433208465576, + "learning_rate": 1e-06, + "loss": 0.7445, + "mean_token_accuracy": 0.7536091804504395, + "num_tokens": 424836325.0, + "step": 16774 + }, + { + "epoch": 1.8421919613441686, + "grad_norm": 1.9908455610275269, + "learning_rate": 1e-06, + "loss": 
0.9219, + "mean_token_accuracy": 0.7180740833282471, + "num_tokens": 424866501.0, + "step": 16775 + }, + { + "epoch": 1.8423017790467824, + "grad_norm": 2.4353137016296387, + "learning_rate": 1e-06, + "loss": 0.7439, + "mean_token_accuracy": 0.7586134672164917, + "num_tokens": 424889128.0, + "step": 16776 + }, + { + "epoch": 1.842411596749396, + "grad_norm": 2.0338518619537354, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7217627763748169, + "num_tokens": 424918192.0, + "step": 16777 + }, + { + "epoch": 1.8425214144520097, + "grad_norm": 2.211409568786621, + "learning_rate": 1e-06, + "loss": 0.8312, + "mean_token_accuracy": 0.7317359447479248, + "num_tokens": 424943341.0, + "step": 16778 + }, + { + "epoch": 1.8426312321546234, + "grad_norm": 2.3809726238250732, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7126376628875732, + "num_tokens": 424969171.0, + "step": 16779 + }, + { + "epoch": 1.842741049857237, + "grad_norm": 2.2833869457244873, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7291960716247559, + "num_tokens": 424994626.0, + "step": 16780 + }, + { + "epoch": 1.8428508675598505, + "grad_norm": 2.056391716003418, + "learning_rate": 1e-06, + "loss": 0.8566, + "mean_token_accuracy": 0.7279844284057617, + "num_tokens": 425022415.0, + "step": 16781 + }, + { + "epoch": 1.8429606852624643, + "grad_norm": 2.0954248905181885, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7045756578445435, + "num_tokens": 425049997.0, + "step": 16782 + }, + { + "epoch": 1.843070502965078, + "grad_norm": 2.2256085872650146, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7188948392868042, + "num_tokens": 425076455.0, + "step": 16783 + }, + { + "epoch": 1.8431803206676918, + "grad_norm": 2.4500925540924072, + "learning_rate": 1e-06, + "loss": 0.8376, + "mean_token_accuracy": 0.7360038757324219, + "num_tokens": 425098135.0, + "step": 16784 + }, + { + "epoch": 
1.8432901383703053, + "grad_norm": 2.530564069747925, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7377899289131165, + "num_tokens": 425120477.0, + "step": 16785 + }, + { + "epoch": 1.8433999560729188, + "grad_norm": 2.15204119682312, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7222084999084473, + "num_tokens": 425147475.0, + "step": 16786 + }, + { + "epoch": 1.8435097737755326, + "grad_norm": 2.1258065700531006, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.6978342533111572, + "num_tokens": 425175307.0, + "step": 16787 + }, + { + "epoch": 1.8436195914781464, + "grad_norm": 2.267390489578247, + "learning_rate": 1e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.728713870048523, + "num_tokens": 425199933.0, + "step": 16788 + }, + { + "epoch": 1.84372940918076, + "grad_norm": 2.22739315032959, + "learning_rate": 1e-06, + "loss": 0.8614, + "mean_token_accuracy": 0.7342227697372437, + "num_tokens": 425224638.0, + "step": 16789 + }, + { + "epoch": 1.8438392268833736, + "grad_norm": 2.286858558654785, + "learning_rate": 1e-06, + "loss": 0.8411, + "mean_token_accuracy": 0.7342873811721802, + "num_tokens": 425248740.0, + "step": 16790 + }, + { + "epoch": 1.8439490445859872, + "grad_norm": 2.2806453704833984, + "learning_rate": 1e-06, + "loss": 0.8254, + "mean_token_accuracy": 0.7382258176803589, + "num_tokens": 425274197.0, + "step": 16791 + }, + { + "epoch": 1.844058862288601, + "grad_norm": 2.2715327739715576, + "learning_rate": 1e-06, + "loss": 0.8564, + "mean_token_accuracy": 0.7278188467025757, + "num_tokens": 425297428.0, + "step": 16792 + }, + { + "epoch": 1.8441686799912147, + "grad_norm": 2.395703077316284, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7106382846832275, + "num_tokens": 425321179.0, + "step": 16793 + }, + { + "epoch": 1.8442784976938282, + "grad_norm": 2.36592435836792, + "learning_rate": 1e-06, + "loss": 0.8587, + "mean_token_accuracy": 
0.7327094674110413, + "num_tokens": 425344667.0, + "step": 16794 + }, + { + "epoch": 1.8443883153964418, + "grad_norm": 2.051382064819336, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7087451219558716, + "num_tokens": 425374821.0, + "step": 16795 + }, + { + "epoch": 1.8444981330990555, + "grad_norm": 2.1580731868743896, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7005230188369751, + "num_tokens": 425401439.0, + "step": 16796 + }, + { + "epoch": 1.8446079508016693, + "grad_norm": 2.008310556411743, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7151572704315186, + "num_tokens": 425431044.0, + "step": 16797 + }, + { + "epoch": 1.844717768504283, + "grad_norm": 2.1339778900146484, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7022780179977417, + "num_tokens": 425461614.0, + "step": 16798 + }, + { + "epoch": 1.8448275862068966, + "grad_norm": 2.468380928039551, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7210990190505981, + "num_tokens": 425482742.0, + "step": 16799 + }, + { + "epoch": 1.84493740390951, + "grad_norm": 2.1606905460357666, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.6937997341156006, + "num_tokens": 425511185.0, + "step": 16800 + }, + { + "epoch": 1.8450472216121239, + "grad_norm": 2.439429759979248, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.6955879330635071, + "num_tokens": 425534188.0, + "step": 16801 + }, + { + "epoch": 1.8451570393147376, + "grad_norm": 2.420804738998413, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7249079346656799, + "num_tokens": 425555823.0, + "step": 16802 + }, + { + "epoch": 1.8452668570173512, + "grad_norm": 2.436305522918701, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7196938395500183, + "num_tokens": 425578317.0, + "step": 16803 + }, + { + "epoch": 1.8453766747199647, + "grad_norm": 
2.2189741134643555, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7233502268791199, + "num_tokens": 425605096.0, + "step": 16804 + }, + { + "epoch": 1.8454864924225785, + "grad_norm": 2.17146372795105, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7127062082290649, + "num_tokens": 425631310.0, + "step": 16805 + }, + { + "epoch": 1.8455963101251922, + "grad_norm": 2.018866539001465, + "learning_rate": 1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7151424884796143, + "num_tokens": 425661811.0, + "step": 16806 + }, + { + "epoch": 1.845706127827806, + "grad_norm": 2.275491714477539, + "learning_rate": 1e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.7354758977890015, + "num_tokens": 425685876.0, + "step": 16807 + }, + { + "epoch": 1.8458159455304195, + "grad_norm": 2.2531559467315674, + "learning_rate": 1e-06, + "loss": 0.8528, + "mean_token_accuracy": 0.7377222180366516, + "num_tokens": 425712160.0, + "step": 16808 + }, + { + "epoch": 1.845925763233033, + "grad_norm": 2.060462713241577, + "learning_rate": 1e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7246755361557007, + "num_tokens": 425740236.0, + "step": 16809 + }, + { + "epoch": 1.8460355809356468, + "grad_norm": 2.075932264328003, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7088962197303772, + "num_tokens": 425769313.0, + "step": 16810 + }, + { + "epoch": 1.8461453986382605, + "grad_norm": 2.694395065307617, + "learning_rate": 1e-06, + "loss": 0.782, + "mean_token_accuracy": 0.7475953102111816, + "num_tokens": 425788142.0, + "step": 16811 + }, + { + "epoch": 1.8462552163408743, + "grad_norm": 2.102241039276123, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.71341872215271, + "num_tokens": 425815847.0, + "step": 16812 + }, + { + "epoch": 1.8463650340434878, + "grad_norm": 2.3253018856048584, + "learning_rate": 1e-06, + "loss": 0.8526, + "mean_token_accuracy": 0.7285747528076172, + "num_tokens": 
425836832.0, + "step": 16813 + }, + { + "epoch": 1.8464748517461014, + "grad_norm": 2.299999237060547, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7139350771903992, + "num_tokens": 425862202.0, + "step": 16814 + }, + { + "epoch": 1.8465846694487151, + "grad_norm": 2.279534339904785, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7142692804336548, + "num_tokens": 425888904.0, + "step": 16815 + }, + { + "epoch": 1.846694487151329, + "grad_norm": 2.1179771423339844, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7278576493263245, + "num_tokens": 425915012.0, + "step": 16816 + }, + { + "epoch": 1.8468043048539424, + "grad_norm": 2.1227304935455322, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7300131320953369, + "num_tokens": 425944698.0, + "step": 16817 + }, + { + "epoch": 1.846914122556556, + "grad_norm": 2.765366315841675, + "learning_rate": 1e-06, + "loss": 0.8703, + "mean_token_accuracy": 0.7300719022750854, + "num_tokens": 425962181.0, + "step": 16818 + }, + { + "epoch": 1.8470239402591697, + "grad_norm": 2.18357253074646, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7247588634490967, + "num_tokens": 425987906.0, + "step": 16819 + }, + { + "epoch": 1.8471337579617835, + "grad_norm": 2.547525405883789, + "learning_rate": 1e-06, + "loss": 0.8372, + "mean_token_accuracy": 0.7313666939735413, + "num_tokens": 426008769.0, + "step": 16820 + }, + { + "epoch": 1.8472435756643972, + "grad_norm": 2.320387125015259, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7082070112228394, + "num_tokens": 426034449.0, + "step": 16821 + }, + { + "epoch": 1.8473533933670108, + "grad_norm": 2.197831153869629, + "learning_rate": 1e-06, + "loss": 0.8269, + "mean_token_accuracy": 0.7404836416244507, + "num_tokens": 426058857.0, + "step": 16822 + }, + { + "epoch": 1.8474632110696243, + "grad_norm": 2.007786273956299, + "learning_rate": 1e-06, 
+ "loss": 0.8635, + "mean_token_accuracy": 0.7218992710113525, + "num_tokens": 426088242.0, + "step": 16823 + }, + { + "epoch": 1.847573028772238, + "grad_norm": 2.1556689739227295, + "learning_rate": 1e-06, + "loss": 0.8035, + "mean_token_accuracy": 0.7360476851463318, + "num_tokens": 426114304.0, + "step": 16824 + }, + { + "epoch": 1.8476828464748518, + "grad_norm": 1.9981104135513306, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7029391527175903, + "num_tokens": 426146253.0, + "step": 16825 + }, + { + "epoch": 1.8477926641774656, + "grad_norm": 2.0374743938446045, + "learning_rate": 1e-06, + "loss": 0.8454, + "mean_token_accuracy": 0.738624095916748, + "num_tokens": 426174890.0, + "step": 16826 + }, + { + "epoch": 1.847902481880079, + "grad_norm": 2.192781925201416, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7190560102462769, + "num_tokens": 426200119.0, + "step": 16827 + }, + { + "epoch": 1.8480122995826926, + "grad_norm": 2.595884323120117, + "learning_rate": 1e-06, + "loss": 0.8448, + "mean_token_accuracy": 0.7355817556381226, + "num_tokens": 426220155.0, + "step": 16828 + }, + { + "epoch": 1.8481221172853064, + "grad_norm": 2.0859439373016357, + "learning_rate": 1e-06, + "loss": 0.8373, + "mean_token_accuracy": 0.7337074279785156, + "num_tokens": 426246455.0, + "step": 16829 + }, + { + "epoch": 1.8482319349879202, + "grad_norm": 2.460514545440674, + "learning_rate": 1e-06, + "loss": 0.8311, + "mean_token_accuracy": 0.7374321222305298, + "num_tokens": 426269320.0, + "step": 16830 + }, + { + "epoch": 1.8483417526905337, + "grad_norm": 1.986388921737671, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7103474736213684, + "num_tokens": 426301359.0, + "step": 16831 + }, + { + "epoch": 1.8484515703931472, + "grad_norm": 2.228182077407837, + "learning_rate": 1e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7199993133544922, + "num_tokens": 426326406.0, + "step": 16832 + }, + { + 
"epoch": 1.848561388095761, + "grad_norm": 2.1120426654815674, + "learning_rate": 1e-06, + "loss": 1.062, + "mean_token_accuracy": 0.6785293221473694, + "num_tokens": 426356531.0, + "step": 16833 + }, + { + "epoch": 1.8486712057983747, + "grad_norm": 2.136863946914673, + "learning_rate": 1e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.731669545173645, + "num_tokens": 426382756.0, + "step": 16834 + }, + { + "epoch": 1.8487810235009885, + "grad_norm": 2.2555031776428223, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7189935445785522, + "num_tokens": 426407941.0, + "step": 16835 + }, + { + "epoch": 1.848890841203602, + "grad_norm": 2.0617053508758545, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.6981021165847778, + "num_tokens": 426439242.0, + "step": 16836 + }, + { + "epoch": 1.8490006589062156, + "grad_norm": 2.2109827995300293, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7031702995300293, + "num_tokens": 426467227.0, + "step": 16837 + }, + { + "epoch": 1.8491104766088293, + "grad_norm": 2.566194772720337, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7321590185165405, + "num_tokens": 426487982.0, + "step": 16838 + }, + { + "epoch": 1.849220294311443, + "grad_norm": 2.3429675102233887, + "learning_rate": 1e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.7250255346298218, + "num_tokens": 426513466.0, + "step": 16839 + }, + { + "epoch": 1.8493301120140566, + "grad_norm": 1.9081919193267822, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7029378414154053, + "num_tokens": 426548129.0, + "step": 16840 + }, + { + "epoch": 1.8494399297166704, + "grad_norm": 2.5588722229003906, + "learning_rate": 1e-06, + "loss": 0.7869, + "mean_token_accuracy": 0.7504825592041016, + "num_tokens": 426567468.0, + "step": 16841 + }, + { + "epoch": 1.849549747419284, + "grad_norm": 1.9241195917129517, + "learning_rate": 1e-06, + "loss": 0.9809, + 
"mean_token_accuracy": 0.6988179683685303, + "num_tokens": 426604090.0, + "step": 16842 + }, + { + "epoch": 1.8496595651218977, + "grad_norm": 2.3710503578186035, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7211632132530212, + "num_tokens": 426625808.0, + "step": 16843 + }, + { + "epoch": 1.8497693828245114, + "grad_norm": 2.5515329837799072, + "learning_rate": 1e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.7359805107116699, + "num_tokens": 426645805.0, + "step": 16844 + }, + { + "epoch": 1.849879200527125, + "grad_norm": 1.9740073680877686, + "learning_rate": 1e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.7278136014938354, + "num_tokens": 426676222.0, + "step": 16845 + }, + { + "epoch": 1.8499890182297385, + "grad_norm": 2.137751340866089, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.6983808279037476, + "num_tokens": 426703847.0, + "step": 16846 + }, + { + "epoch": 1.8500988359323522, + "grad_norm": 2.181837797164917, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7011911869049072, + "num_tokens": 426733537.0, + "step": 16847 + }, + { + "epoch": 1.850208653634966, + "grad_norm": 2.2482073307037354, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7173941135406494, + "num_tokens": 426759840.0, + "step": 16848 + }, + { + "epoch": 1.8503184713375798, + "grad_norm": 2.4757378101348877, + "learning_rate": 1e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.725967288017273, + "num_tokens": 426781151.0, + "step": 16849 + }, + { + "epoch": 1.8504282890401933, + "grad_norm": 2.1370623111724854, + "learning_rate": 1e-06, + "loss": 0.865, + "mean_token_accuracy": 0.7259795665740967, + "num_tokens": 426808270.0, + "step": 16850 + }, + { + "epoch": 1.8505381067428068, + "grad_norm": 2.051424264907837, + "learning_rate": 1e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7206943035125732, + "num_tokens": 426836363.0, + "step": 16851 + }, + { + "epoch": 
1.8506479244454206, + "grad_norm": 2.4012956619262695, + "learning_rate": 1e-06, + "loss": 0.816, + "mean_token_accuracy": 0.7402889728546143, + "num_tokens": 426857811.0, + "step": 16852 + }, + { + "epoch": 1.8507577421480343, + "grad_norm": 2.338752031326294, + "learning_rate": 1e-06, + "loss": 0.888, + "mean_token_accuracy": 0.7235733270645142, + "num_tokens": 426880633.0, + "step": 16853 + }, + { + "epoch": 1.8508675598506479, + "grad_norm": 2.3013713359832764, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7213715314865112, + "num_tokens": 426905889.0, + "step": 16854 + }, + { + "epoch": 1.8509773775532616, + "grad_norm": 2.391484498977661, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7089536190032959, + "num_tokens": 426930434.0, + "step": 16855 + }, + { + "epoch": 1.8510871952558752, + "grad_norm": 2.2178945541381836, + "learning_rate": 1e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7177466750144958, + "num_tokens": 426958370.0, + "step": 16856 + }, + { + "epoch": 1.851197012958489, + "grad_norm": 2.1429708003997803, + "learning_rate": 1e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.725581705570221, + "num_tokens": 426984334.0, + "step": 16857 + }, + { + "epoch": 1.8513068306611027, + "grad_norm": 2.2499213218688965, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.697330892086029, + "num_tokens": 427010947.0, + "step": 16858 + }, + { + "epoch": 1.8514166483637162, + "grad_norm": 2.456437587738037, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7182310223579407, + "num_tokens": 427033165.0, + "step": 16859 + }, + { + "epoch": 1.8515264660663298, + "grad_norm": 2.3806848526000977, + "learning_rate": 1e-06, + "loss": 0.8265, + "mean_token_accuracy": 0.7369049787521362, + "num_tokens": 427056237.0, + "step": 16860 + }, + { + "epoch": 1.8516362837689435, + "grad_norm": 2.338426113128662, + "learning_rate": 1e-06, + "loss": 0.8219, + "mean_token_accuracy": 
0.7385362386703491, + "num_tokens": 427079786.0, + "step": 16861 + }, + { + "epoch": 1.8517461014715573, + "grad_norm": 2.329406499862671, + "learning_rate": 1e-06, + "loss": 0.8424, + "mean_token_accuracy": 0.7285112142562866, + "num_tokens": 427103469.0, + "step": 16862 + }, + { + "epoch": 1.851855919174171, + "grad_norm": 2.107900857925415, + "learning_rate": 1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7235394716262817, + "num_tokens": 427130002.0, + "step": 16863 + }, + { + "epoch": 1.8519657368767846, + "grad_norm": 2.3618428707122803, + "learning_rate": 1e-06, + "loss": 0.8585, + "mean_token_accuracy": 0.7426356673240662, + "num_tokens": 427151388.0, + "step": 16864 + }, + { + "epoch": 1.852075554579398, + "grad_norm": 1.9378587007522583, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.710116982460022, + "num_tokens": 427182116.0, + "step": 16865 + }, + { + "epoch": 1.8521853722820119, + "grad_norm": 2.2692863941192627, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7080853581428528, + "num_tokens": 427207706.0, + "step": 16866 + }, + { + "epoch": 1.8522951899846256, + "grad_norm": 1.9754014015197754, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7154793739318848, + "num_tokens": 427240092.0, + "step": 16867 + }, + { + "epoch": 1.8524050076872391, + "grad_norm": 2.2051355838775635, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7243388891220093, + "num_tokens": 427265859.0, + "step": 16868 + }, + { + "epoch": 1.8525148253898527, + "grad_norm": 2.236987590789795, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7195333242416382, + "num_tokens": 427293163.0, + "step": 16869 + }, + { + "epoch": 1.8526246430924664, + "grad_norm": 2.0160934925079346, + "learning_rate": 1e-06, + "loss": 0.8466, + "mean_token_accuracy": 0.741441547870636, + "num_tokens": 427321708.0, + "step": 16870 + }, + { + "epoch": 1.8527344607950802, + "grad_norm": 
2.2855935096740723, + "learning_rate": 1e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.7230849266052246, + "num_tokens": 427346388.0, + "step": 16871 + }, + { + "epoch": 1.852844278497694, + "grad_norm": 2.1374545097351074, + "learning_rate": 1e-06, + "loss": 0.876, + "mean_token_accuracy": 0.7225257754325867, + "num_tokens": 427375784.0, + "step": 16872 + }, + { + "epoch": 1.8529540962003075, + "grad_norm": 2.2986860275268555, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7229076027870178, + "num_tokens": 427400386.0, + "step": 16873 + }, + { + "epoch": 1.853063913902921, + "grad_norm": 2.351393938064575, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.718124270439148, + "num_tokens": 427425273.0, + "step": 16874 + }, + { + "epoch": 1.8531737316055348, + "grad_norm": 2.2433483600616455, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7179276347160339, + "num_tokens": 427451740.0, + "step": 16875 + }, + { + "epoch": 1.8532835493081485, + "grad_norm": 2.313041925430298, + "learning_rate": 1e-06, + "loss": 0.8561, + "mean_token_accuracy": 0.7307958602905273, + "num_tokens": 427474547.0, + "step": 16876 + }, + { + "epoch": 1.8533933670107623, + "grad_norm": 2.3518519401550293, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7066315412521362, + "num_tokens": 427499313.0, + "step": 16877 + }, + { + "epoch": 1.8535031847133758, + "grad_norm": 2.169543743133545, + "learning_rate": 1e-06, + "loss": 0.7572, + "mean_token_accuracy": 0.7584310173988342, + "num_tokens": 427524600.0, + "step": 16878 + }, + { + "epoch": 1.8536130024159894, + "grad_norm": 2.008641242980957, + "learning_rate": 1e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.7340761423110962, + "num_tokens": 427554107.0, + "step": 16879 + }, + { + "epoch": 1.8537228201186031, + "grad_norm": 2.433257579803467, + "learning_rate": 1e-06, + "loss": 0.8032, + "mean_token_accuracy": 0.7498663663864136, + "num_tokens": 
427576955.0, + "step": 16880 + }, + { + "epoch": 1.8538326378212169, + "grad_norm": 2.274729013442993, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.6976036429405212, + "num_tokens": 427601913.0, + "step": 16881 + }, + { + "epoch": 1.8539424555238304, + "grad_norm": 2.1278014183044434, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7172360420227051, + "num_tokens": 427629359.0, + "step": 16882 + }, + { + "epoch": 1.854052273226444, + "grad_norm": 2.148430347442627, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7233290672302246, + "num_tokens": 427656594.0, + "step": 16883 + }, + { + "epoch": 1.8541620909290577, + "grad_norm": 2.4963440895080566, + "learning_rate": 1e-06, + "loss": 0.8373, + "mean_token_accuracy": 0.7390686273574829, + "num_tokens": 427677291.0, + "step": 16884 + }, + { + "epoch": 1.8542719086316715, + "grad_norm": 2.4860196113586426, + "learning_rate": 1e-06, + "loss": 0.7845, + "mean_token_accuracy": 0.7445545196533203, + "num_tokens": 427697293.0, + "step": 16885 + }, + { + "epoch": 1.8543817263342852, + "grad_norm": 2.465627431869507, + "learning_rate": 1e-06, + "loss": 0.8389, + "mean_token_accuracy": 0.7406869530677795, + "num_tokens": 427718271.0, + "step": 16886 + }, + { + "epoch": 1.8544915440368988, + "grad_norm": 2.3091046810150146, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7156479358673096, + "num_tokens": 427741593.0, + "step": 16887 + }, + { + "epoch": 1.8546013617395123, + "grad_norm": 2.215914487838745, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7095160484313965, + "num_tokens": 427769962.0, + "step": 16888 + }, + { + "epoch": 1.854711179442126, + "grad_norm": 2.248337984085083, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7178138494491577, + "num_tokens": 427795713.0, + "step": 16889 + }, + { + "epoch": 1.8548209971447398, + "grad_norm": 2.5179824829101562, + "learning_rate": 
1e-06, + "loss": 0.826, + "mean_token_accuracy": 0.7417163848876953, + "num_tokens": 427815957.0, + "step": 16890 + }, + { + "epoch": 1.8549308148473536, + "grad_norm": 2.164547920227051, + "learning_rate": 1e-06, + "loss": 1.023, + "mean_token_accuracy": 0.6970838308334351, + "num_tokens": 427845415.0, + "step": 16891 + }, + { + "epoch": 1.855040632549967, + "grad_norm": 2.4906322956085205, + "learning_rate": 1e-06, + "loss": 0.8628, + "mean_token_accuracy": 0.7339832782745361, + "num_tokens": 427864834.0, + "step": 16892 + }, + { + "epoch": 1.8551504502525806, + "grad_norm": 2.549807071685791, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7147932648658752, + "num_tokens": 427886986.0, + "step": 16893 + }, + { + "epoch": 1.8552602679551944, + "grad_norm": 2.208956718444824, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.6908977627754211, + "num_tokens": 427915282.0, + "step": 16894 + }, + { + "epoch": 1.8553700856578081, + "grad_norm": 2.4894509315490723, + "learning_rate": 1e-06, + "loss": 0.8526, + "mean_token_accuracy": 0.7286036610603333, + "num_tokens": 427937186.0, + "step": 16895 + }, + { + "epoch": 1.8554799033604217, + "grad_norm": 2.3029677867889404, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.726784348487854, + "num_tokens": 427961549.0, + "step": 16896 + }, + { + "epoch": 1.8555897210630352, + "grad_norm": 2.259777784347534, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7101219892501831, + "num_tokens": 427987723.0, + "step": 16897 + }, + { + "epoch": 1.855699538765649, + "grad_norm": 2.394804000854492, + "learning_rate": 1e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7350694537162781, + "num_tokens": 428010030.0, + "step": 16898 + }, + { + "epoch": 1.8558093564682627, + "grad_norm": 2.230929374694824, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.6917181015014648, + "num_tokens": 428038487.0, + "step": 16899 + }, + { + 
"epoch": 1.8559191741708765, + "grad_norm": 2.3364505767822266, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7069728374481201, + "num_tokens": 428061477.0, + "step": 16900 + }, + { + "epoch": 1.85602899187349, + "grad_norm": 2.232877254486084, + "learning_rate": 1e-06, + "loss": 0.8642, + "mean_token_accuracy": 0.726642370223999, + "num_tokens": 428088545.0, + "step": 16901 + }, + { + "epoch": 1.8561388095761036, + "grad_norm": 2.3179523944854736, + "learning_rate": 1e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7220618724822998, + "num_tokens": 428111201.0, + "step": 16902 + }, + { + "epoch": 1.8562486272787173, + "grad_norm": 2.480933427810669, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7014588713645935, + "num_tokens": 428135132.0, + "step": 16903 + }, + { + "epoch": 1.856358444981331, + "grad_norm": 2.0876684188842773, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7126380205154419, + "num_tokens": 428163653.0, + "step": 16904 + }, + { + "epoch": 1.8564682626839446, + "grad_norm": 2.0454845428466797, + "learning_rate": 1e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7251485586166382, + "num_tokens": 428194202.0, + "step": 16905 + }, + { + "epoch": 1.8565780803865584, + "grad_norm": 2.2475154399871826, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7098555564880371, + "num_tokens": 428220511.0, + "step": 16906 + }, + { + "epoch": 1.856687898089172, + "grad_norm": 2.3390023708343506, + "learning_rate": 1e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7309545278549194, + "num_tokens": 428244841.0, + "step": 16907 + }, + { + "epoch": 1.8567977157917857, + "grad_norm": 2.213139057159424, + "learning_rate": 1e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.7361254096031189, + "num_tokens": 428271140.0, + "step": 16908 + }, + { + "epoch": 1.8569075334943994, + "grad_norm": 2.7237651348114014, + "learning_rate": 1e-06, + "loss": 0.9425, + 
"mean_token_accuracy": 0.703217625617981, + "num_tokens": 428291890.0, + "step": 16909 + }, + { + "epoch": 1.857017351197013, + "grad_norm": 1.9919971227645874, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7159479856491089, + "num_tokens": 428323881.0, + "step": 16910 + }, + { + "epoch": 1.8571271688996265, + "grad_norm": 2.308143377304077, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7310190200805664, + "num_tokens": 428349320.0, + "step": 16911 + }, + { + "epoch": 1.8572369866022402, + "grad_norm": 2.3732047080993652, + "learning_rate": 1e-06, + "loss": 0.816, + "mean_token_accuracy": 0.7377243041992188, + "num_tokens": 428371045.0, + "step": 16912 + }, + { + "epoch": 1.857346804304854, + "grad_norm": 2.2407588958740234, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7073345184326172, + "num_tokens": 428397366.0, + "step": 16913 + }, + { + "epoch": 1.8574566220074678, + "grad_norm": 2.3783628940582275, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7191604375839233, + "num_tokens": 428420065.0, + "step": 16914 + }, + { + "epoch": 1.8575664397100813, + "grad_norm": 2.2179670333862305, + "learning_rate": 1e-06, + "loss": 0.8159, + "mean_token_accuracy": 0.7434772253036499, + "num_tokens": 428444944.0, + "step": 16915 + }, + { + "epoch": 1.8576762574126948, + "grad_norm": 2.51068377494812, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7177690267562866, + "num_tokens": 428467518.0, + "step": 16916 + }, + { + "epoch": 1.8577860751153086, + "grad_norm": 2.1147024631500244, + "learning_rate": 1e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.6913439035415649, + "num_tokens": 428496493.0, + "step": 16917 + }, + { + "epoch": 1.8578958928179223, + "grad_norm": 2.1358132362365723, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7056524753570557, + "num_tokens": 428527892.0, + "step": 16918 + }, + { + "epoch": 
1.8580057105205359, + "grad_norm": 2.3517417907714844, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7025418281555176, + "num_tokens": 428551074.0, + "step": 16919 + }, + { + "epoch": 1.8581155282231496, + "grad_norm": 2.6028475761413574, + "learning_rate": 1e-06, + "loss": 0.8274, + "mean_token_accuracy": 0.7349619269371033, + "num_tokens": 428571143.0, + "step": 16920 + }, + { + "epoch": 1.8582253459257632, + "grad_norm": 2.094331741333008, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.6910445690155029, + "num_tokens": 428601070.0, + "step": 16921 + }, + { + "epoch": 1.858335163628377, + "grad_norm": 2.1704437732696533, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7039035558700562, + "num_tokens": 428629823.0, + "step": 16922 + }, + { + "epoch": 1.8584449813309907, + "grad_norm": 2.855921506881714, + "learning_rate": 1e-06, + "loss": 0.8228, + "mean_token_accuracy": 0.7377108335494995, + "num_tokens": 428647411.0, + "step": 16923 + }, + { + "epoch": 1.8585547990336042, + "grad_norm": 2.271705150604248, + "learning_rate": 1e-06, + "loss": 0.8705, + "mean_token_accuracy": 0.7250498533248901, + "num_tokens": 428671770.0, + "step": 16924 + }, + { + "epoch": 1.8586646167362177, + "grad_norm": 2.036980390548706, + "learning_rate": 1e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.7322473526000977, + "num_tokens": 428699998.0, + "step": 16925 + }, + { + "epoch": 1.8587744344388315, + "grad_norm": 1.9854637384414673, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.725091278553009, + "num_tokens": 428729887.0, + "step": 16926 + }, + { + "epoch": 1.8588842521414453, + "grad_norm": 2.18517804145813, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7159726619720459, + "num_tokens": 428755898.0, + "step": 16927 + }, + { + "epoch": 1.858994069844059, + "grad_norm": 2.374683141708374, + "learning_rate": 1e-06, + "loss": 0.8384, + "mean_token_accuracy": 
0.7332558035850525, + "num_tokens": 428778792.0, + "step": 16928 + }, + { + "epoch": 1.8591038875466726, + "grad_norm": 2.1225674152374268, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.6869903802871704, + "num_tokens": 428809049.0, + "step": 16929 + }, + { + "epoch": 1.859213705249286, + "grad_norm": 2.3132643699645996, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7053751945495605, + "num_tokens": 428833760.0, + "step": 16930 + }, + { + "epoch": 1.8593235229518998, + "grad_norm": 2.7517781257629395, + "learning_rate": 1e-06, + "loss": 0.8152, + "mean_token_accuracy": 0.7480679154396057, + "num_tokens": 428853242.0, + "step": 16931 + }, + { + "epoch": 1.8594333406545136, + "grad_norm": 2.5840516090393066, + "learning_rate": 1e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.7259947061538696, + "num_tokens": 428874718.0, + "step": 16932 + }, + { + "epoch": 1.8595431583571271, + "grad_norm": 2.2315986156463623, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7190217971801758, + "num_tokens": 428900774.0, + "step": 16933 + }, + { + "epoch": 1.8596529760597407, + "grad_norm": 2.2072525024414062, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7074416875839233, + "num_tokens": 428927345.0, + "step": 16934 + }, + { + "epoch": 1.8597627937623544, + "grad_norm": 2.218979835510254, + "learning_rate": 1e-06, + "loss": 0.7948, + "mean_token_accuracy": 0.7456693053245544, + "num_tokens": 428951694.0, + "step": 16935 + }, + { + "epoch": 1.8598726114649682, + "grad_norm": 2.180271863937378, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.70458984375, + "num_tokens": 428978393.0, + "step": 16936 + }, + { + "epoch": 1.859982429167582, + "grad_norm": 2.428889751434326, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7210366725921631, + "num_tokens": 429001146.0, + "step": 16937 + }, + { + "epoch": 1.8600922468701955, + "grad_norm": 
2.5155389308929443, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7222356200218201, + "num_tokens": 429023031.0, + "step": 16938 + }, + { + "epoch": 1.860202064572809, + "grad_norm": 2.535068988800049, + "learning_rate": 1e-06, + "loss": 0.781, + "mean_token_accuracy": 0.7507770657539368, + "num_tokens": 429042552.0, + "step": 16939 + }, + { + "epoch": 1.8603118822754228, + "grad_norm": 2.060515880584717, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7169257402420044, + "num_tokens": 429070977.0, + "step": 16940 + }, + { + "epoch": 1.8604216999780365, + "grad_norm": 2.175520181655884, + "learning_rate": 1e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7170380353927612, + "num_tokens": 429096395.0, + "step": 16941 + }, + { + "epoch": 1.8605315176806503, + "grad_norm": 2.2820963859558105, + "learning_rate": 1e-06, + "loss": 0.861, + "mean_token_accuracy": 0.7366857528686523, + "num_tokens": 429119750.0, + "step": 16942 + }, + { + "epoch": 1.8606413353832638, + "grad_norm": 1.9398307800292969, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.724211573600769, + "num_tokens": 429151393.0, + "step": 16943 + }, + { + "epoch": 1.8607511530858774, + "grad_norm": 2.312648296356201, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7320419549942017, + "num_tokens": 429175434.0, + "step": 16944 + }, + { + "epoch": 1.860860970788491, + "grad_norm": 2.210841417312622, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7188394069671631, + "num_tokens": 429202381.0, + "step": 16945 + }, + { + "epoch": 1.8609707884911049, + "grad_norm": 2.2449700832366943, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7182351350784302, + "num_tokens": 429227959.0, + "step": 16946 + }, + { + "epoch": 1.8610806061937184, + "grad_norm": 2.227374792098999, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.7027611136436462, + "num_tokens": 
429255731.0, + "step": 16947 + }, + { + "epoch": 1.861190423896332, + "grad_norm": 2.2627081871032715, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.724141538143158, + "num_tokens": 429280747.0, + "step": 16948 + }, + { + "epoch": 1.8613002415989457, + "grad_norm": 1.8738117218017578, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7193834781646729, + "num_tokens": 429315627.0, + "step": 16949 + }, + { + "epoch": 1.8614100593015594, + "grad_norm": 2.032970905303955, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.6988121271133423, + "num_tokens": 429346540.0, + "step": 16950 + }, + { + "epoch": 1.8615198770041732, + "grad_norm": 2.2425339221954346, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7205042839050293, + "num_tokens": 429371534.0, + "step": 16951 + }, + { + "epoch": 1.8616296947067867, + "grad_norm": 2.4049699306488037, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7261300683021545, + "num_tokens": 429393327.0, + "step": 16952 + }, + { + "epoch": 1.8617395124094003, + "grad_norm": 2.317061424255371, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7041885256767273, + "num_tokens": 429417246.0, + "step": 16953 + }, + { + "epoch": 1.861849330112014, + "grad_norm": 2.222490072250366, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7126253247261047, + "num_tokens": 429443243.0, + "step": 16954 + }, + { + "epoch": 1.8619591478146278, + "grad_norm": 2.4668948650360107, + "learning_rate": 1e-06, + "loss": 0.833, + "mean_token_accuracy": 0.7382887601852417, + "num_tokens": 429464214.0, + "step": 16955 + }, + { + "epoch": 1.8620689655172413, + "grad_norm": 2.0688083171844482, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.706646203994751, + "num_tokens": 429493685.0, + "step": 16956 + }, + { + "epoch": 1.862178783219855, + "grad_norm": 2.0601439476013184, + "learning_rate": 1e-06, 
+ "loss": 0.9643, + "mean_token_accuracy": 0.7020145058631897, + "num_tokens": 429521559.0, + "step": 16957 + }, + { + "epoch": 1.8622886009224686, + "grad_norm": 2.2014873027801514, + "learning_rate": 1e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.6970356702804565, + "num_tokens": 429550973.0, + "step": 16958 + }, + { + "epoch": 1.8623984186250824, + "grad_norm": 2.130284547805786, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7091962695121765, + "num_tokens": 429580544.0, + "step": 16959 + }, + { + "epoch": 1.8625082363276961, + "grad_norm": 1.9076505899429321, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7125234603881836, + "num_tokens": 429613281.0, + "step": 16960 + }, + { + "epoch": 1.8626180540303097, + "grad_norm": 2.2121546268463135, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7210586667060852, + "num_tokens": 429638039.0, + "step": 16961 + }, + { + "epoch": 1.8627278717329232, + "grad_norm": 2.487624168395996, + "learning_rate": 1e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.714809775352478, + "num_tokens": 429659503.0, + "step": 16962 + }, + { + "epoch": 1.862837689435537, + "grad_norm": 2.2109599113464355, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.691857635974884, + "num_tokens": 429686580.0, + "step": 16963 + }, + { + "epoch": 1.8629475071381507, + "grad_norm": 2.4575822353363037, + "learning_rate": 1e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7125318646430969, + "num_tokens": 429710378.0, + "step": 16964 + }, + { + "epoch": 1.8630573248407645, + "grad_norm": 2.070061445236206, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7071892023086548, + "num_tokens": 429741180.0, + "step": 16965 + }, + { + "epoch": 1.863167142543378, + "grad_norm": 2.104761838912964, + "learning_rate": 1e-06, + "loss": 0.8335, + "mean_token_accuracy": 0.7330540418624878, + "num_tokens": 429768420.0, + "step": 16966 + }, + { + 
"epoch": 1.8632769602459915, + "grad_norm": 2.6913630962371826, + "learning_rate": 1e-06, + "loss": 0.871, + "mean_token_accuracy": 0.727112352848053, + "num_tokens": 429787552.0, + "step": 16967 + }, + { + "epoch": 1.8633867779486053, + "grad_norm": 2.230055332183838, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7100707292556763, + "num_tokens": 429815275.0, + "step": 16968 + }, + { + "epoch": 1.863496595651219, + "grad_norm": 1.9883222579956055, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.6961163282394409, + "num_tokens": 429846055.0, + "step": 16969 + }, + { + "epoch": 1.8636064133538326, + "grad_norm": 2.0765857696533203, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.695482611656189, + "num_tokens": 429876375.0, + "step": 16970 + }, + { + "epoch": 1.8637162310564463, + "grad_norm": 2.4116570949554443, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7193542718887329, + "num_tokens": 429898706.0, + "step": 16971 + }, + { + "epoch": 1.8638260487590599, + "grad_norm": 2.2459628582000732, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7280555367469788, + "num_tokens": 429923373.0, + "step": 16972 + }, + { + "epoch": 1.8639358664616736, + "grad_norm": 2.315856695175171, + "learning_rate": 1e-06, + "loss": 0.8766, + "mean_token_accuracy": 0.7311794757843018, + "num_tokens": 429946344.0, + "step": 16973 + }, + { + "epoch": 1.8640456841642874, + "grad_norm": 2.0995981693267822, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7069655656814575, + "num_tokens": 429975881.0, + "step": 16974 + }, + { + "epoch": 1.864155501866901, + "grad_norm": 2.0836164951324463, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.717863917350769, + "num_tokens": 430004795.0, + "step": 16975 + }, + { + "epoch": 1.8642653195695145, + "grad_norm": 2.0325207710266113, + "learning_rate": 1e-06, + "loss": 0.8778, + 
"mean_token_accuracy": 0.7205199003219604, + "num_tokens": 430032512.0, + "step": 16976 + }, + { + "epoch": 1.8643751372721282, + "grad_norm": 2.2660841941833496, + "learning_rate": 1e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.723537266254425, + "num_tokens": 430055729.0, + "step": 16977 + }, + { + "epoch": 1.864484954974742, + "grad_norm": 2.3319122791290283, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7288247346878052, + "num_tokens": 430079897.0, + "step": 16978 + }, + { + "epoch": 1.8645947726773557, + "grad_norm": 2.722674608230591, + "learning_rate": 1e-06, + "loss": 0.8425, + "mean_token_accuracy": 0.742687463760376, + "num_tokens": 430097803.0, + "step": 16979 + }, + { + "epoch": 1.8647045903799693, + "grad_norm": 2.1999733448028564, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7041370868682861, + "num_tokens": 430124463.0, + "step": 16980 + }, + { + "epoch": 1.8648144080825828, + "grad_norm": 2.0766842365264893, + "learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.7259265184402466, + "num_tokens": 430151103.0, + "step": 16981 + }, + { + "epoch": 1.8649242257851966, + "grad_norm": 2.3555307388305664, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7073848247528076, + "num_tokens": 430176058.0, + "step": 16982 + }, + { + "epoch": 1.8650340434878103, + "grad_norm": 2.4317073822021484, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.712899923324585, + "num_tokens": 430199114.0, + "step": 16983 + }, + { + "epoch": 1.8651438611904239, + "grad_norm": 2.2432408332824707, + "learning_rate": 1e-06, + "loss": 0.861, + "mean_token_accuracy": 0.7265496850013733, + "num_tokens": 430223847.0, + "step": 16984 + }, + { + "epoch": 1.8652536788930374, + "grad_norm": 2.244245767593384, + "learning_rate": 1e-06, + "loss": 0.8705, + "mean_token_accuracy": 0.7231917381286621, + "num_tokens": 430247861.0, + "step": 16985 + }, + { + "epoch": 
1.8653634965956511, + "grad_norm": 2.165036201477051, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.6979959011077881, + "num_tokens": 430277257.0, + "step": 16986 + }, + { + "epoch": 1.865473314298265, + "grad_norm": 2.1208720207214355, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7193146347999573, + "num_tokens": 430305712.0, + "step": 16987 + }, + { + "epoch": 1.8655831320008787, + "grad_norm": 2.324070453643799, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7088735103607178, + "num_tokens": 430331546.0, + "step": 16988 + }, + { + "epoch": 1.8656929497034922, + "grad_norm": 2.3347725868225098, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7212845087051392, + "num_tokens": 430356101.0, + "step": 16989 + }, + { + "epoch": 1.8658027674061057, + "grad_norm": 2.475809335708618, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7206426858901978, + "num_tokens": 430377770.0, + "step": 16990 + }, + { + "epoch": 1.8659125851087195, + "grad_norm": 2.3416879177093506, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7196241021156311, + "num_tokens": 430403481.0, + "step": 16991 + }, + { + "epoch": 1.8660224028113332, + "grad_norm": 2.1951241493225098, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7051828503608704, + "num_tokens": 430432078.0, + "step": 16992 + }, + { + "epoch": 1.866132220513947, + "grad_norm": 2.522317409515381, + "learning_rate": 1e-06, + "loss": 0.8644, + "mean_token_accuracy": 0.7354757785797119, + "num_tokens": 430453430.0, + "step": 16993 + }, + { + "epoch": 1.8662420382165605, + "grad_norm": 2.1487278938293457, + "learning_rate": 1e-06, + "loss": 0.8806, + "mean_token_accuracy": 0.7269365787506104, + "num_tokens": 430480207.0, + "step": 16994 + }, + { + "epoch": 1.866351855919174, + "grad_norm": 2.573819160461426, + "learning_rate": 1e-06, + "loss": 0.8068, + "mean_token_accuracy": 
0.7440317869186401, + "num_tokens": 430500109.0, + "step": 16995 + }, + { + "epoch": 1.8664616736217878, + "grad_norm": 2.4556386470794678, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7297312021255493, + "num_tokens": 430522890.0, + "step": 16996 + }, + { + "epoch": 1.8665714913244016, + "grad_norm": 2.4294285774230957, + "learning_rate": 1e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7431381940841675, + "num_tokens": 430544131.0, + "step": 16997 + }, + { + "epoch": 1.8666813090270151, + "grad_norm": 2.2068960666656494, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7189792990684509, + "num_tokens": 430571350.0, + "step": 16998 + }, + { + "epoch": 1.8667911267296287, + "grad_norm": 2.555612325668335, + "learning_rate": 1e-06, + "loss": 0.8027, + "mean_token_accuracy": 0.7395082712173462, + "num_tokens": 430591816.0, + "step": 16999 + }, + { + "epoch": 1.8669009444322424, + "grad_norm": 2.0900845527648926, + "learning_rate": 1e-06, + "loss": 0.7744, + "mean_token_accuracy": 0.7531423568725586, + "num_tokens": 430618099.0, + "step": 17000 + }, + { + "epoch": 1.8670107621348562, + "grad_norm": 2.0834579467773438, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7136757373809814, + "num_tokens": 430645833.0, + "step": 17001 + }, + { + "epoch": 1.86712057983747, + "grad_norm": 2.345022678375244, + "learning_rate": 1e-06, + "loss": 0.7865, + "mean_token_accuracy": 0.7506484389305115, + "num_tokens": 430667051.0, + "step": 17002 + }, + { + "epoch": 1.8672303975400835, + "grad_norm": 2.2497825622558594, + "learning_rate": 1e-06, + "loss": 0.8362, + "mean_token_accuracy": 0.7426929473876953, + "num_tokens": 430691273.0, + "step": 17003 + }, + { + "epoch": 1.867340215242697, + "grad_norm": 2.2240066528320312, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.718671441078186, + "num_tokens": 430715736.0, + "step": 17004 + }, + { + "epoch": 1.8674500329453108, + "grad_norm": 
2.172173023223877, + "learning_rate": 1e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.737500011920929, + "num_tokens": 430740171.0, + "step": 17005 + }, + { + "epoch": 1.8675598506479245, + "grad_norm": 2.2268741130828857, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7154535055160522, + "num_tokens": 430764028.0, + "step": 17006 + }, + { + "epoch": 1.8676696683505383, + "grad_norm": 2.3864705562591553, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7195749878883362, + "num_tokens": 430788389.0, + "step": 17007 + }, + { + "epoch": 1.8677794860531518, + "grad_norm": 2.388408660888672, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7226216793060303, + "num_tokens": 430811193.0, + "step": 17008 + }, + { + "epoch": 1.8678893037557653, + "grad_norm": 2.562699556350708, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7170412540435791, + "num_tokens": 430831453.0, + "step": 17009 + }, + { + "epoch": 1.867999121458379, + "grad_norm": 2.340787887573242, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7134528756141663, + "num_tokens": 430855512.0, + "step": 17010 + }, + { + "epoch": 1.8681089391609929, + "grad_norm": 2.311748504638672, + "learning_rate": 1e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.7305055856704712, + "num_tokens": 430879414.0, + "step": 17011 + }, + { + "epoch": 1.8682187568636064, + "grad_norm": 2.222980260848999, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7273334264755249, + "num_tokens": 430905682.0, + "step": 17012 + }, + { + "epoch": 1.86832857456622, + "grad_norm": 2.246213674545288, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7245550155639648, + "num_tokens": 430931587.0, + "step": 17013 + }, + { + "epoch": 1.8684383922688337, + "grad_norm": 2.342177152633667, + "learning_rate": 1e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.716117262840271, + "num_tokens": 
430956334.0, + "step": 17014 + }, + { + "epoch": 1.8685482099714474, + "grad_norm": 2.381394147872925, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7132844924926758, + "num_tokens": 430979442.0, + "step": 17015 + }, + { + "epoch": 1.8686580276740612, + "grad_norm": 2.230339765548706, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.707083523273468, + "num_tokens": 431005667.0, + "step": 17016 + }, + { + "epoch": 1.8687678453766747, + "grad_norm": 2.024409532546997, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7123619318008423, + "num_tokens": 431036051.0, + "step": 17017 + }, + { + "epoch": 1.8688776630792883, + "grad_norm": 2.106278657913208, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7201119661331177, + "num_tokens": 431064330.0, + "step": 17018 + }, + { + "epoch": 1.868987480781902, + "grad_norm": 1.9657418727874756, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7037059664726257, + "num_tokens": 431097779.0, + "step": 17019 + }, + { + "epoch": 1.8690972984845158, + "grad_norm": 2.3737103939056396, + "learning_rate": 1e-06, + "loss": 0.8004, + "mean_token_accuracy": 0.7524742484092712, + "num_tokens": 431120291.0, + "step": 17020 + }, + { + "epoch": 1.8692071161871293, + "grad_norm": 2.2122433185577393, + "learning_rate": 1e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.7287253141403198, + "num_tokens": 431144748.0, + "step": 17021 + }, + { + "epoch": 1.869316933889743, + "grad_norm": 2.1062357425689697, + "learning_rate": 1e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7247079610824585, + "num_tokens": 431170426.0, + "step": 17022 + }, + { + "epoch": 1.8694267515923566, + "grad_norm": 2.140568971633911, + "learning_rate": 1e-06, + "loss": 0.8499, + "mean_token_accuracy": 0.7318243384361267, + "num_tokens": 431196615.0, + "step": 17023 + }, + { + "epoch": 1.8695365692949704, + "grad_norm": 2.2129759788513184, + "learning_rate": 
1e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7168768644332886, + "num_tokens": 431223059.0, + "step": 17024 + }, + { + "epoch": 1.8696463869975841, + "grad_norm": 2.440246105194092, + "learning_rate": 1e-06, + "loss": 0.8119, + "mean_token_accuracy": 0.7400764226913452, + "num_tokens": 431246138.0, + "step": 17025 + }, + { + "epoch": 1.8697562047001977, + "grad_norm": 2.260613203048706, + "learning_rate": 1e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.7362701892852783, + "num_tokens": 431269099.0, + "step": 17026 + }, + { + "epoch": 1.8698660224028112, + "grad_norm": 2.2960305213928223, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7004229426383972, + "num_tokens": 431294609.0, + "step": 17027 + }, + { + "epoch": 1.869975840105425, + "grad_norm": 2.3655025959014893, + "learning_rate": 1e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.7272631525993347, + "num_tokens": 431316804.0, + "step": 17028 + }, + { + "epoch": 1.8700856578080387, + "grad_norm": 2.242915630340576, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7030090689659119, + "num_tokens": 431343352.0, + "step": 17029 + }, + { + "epoch": 1.8701954755106525, + "grad_norm": 2.2760884761810303, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7173671722412109, + "num_tokens": 431370547.0, + "step": 17030 + }, + { + "epoch": 1.870305293213266, + "grad_norm": 2.913874387741089, + "learning_rate": 1e-06, + "loss": 0.724, + "mean_token_accuracy": 0.7572324275970459, + "num_tokens": 431386122.0, + "step": 17031 + }, + { + "epoch": 1.8704151109158795, + "grad_norm": 2.169557809829712, + "learning_rate": 1e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7284398674964905, + "num_tokens": 431411186.0, + "step": 17032 + }, + { + "epoch": 1.8705249286184933, + "grad_norm": 2.4718315601348877, + "learning_rate": 1e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.732635498046875, + "num_tokens": 431433213.0, + "step": 17033 + }, + { + 
"epoch": 1.870634746321107, + "grad_norm": 2.0886943340301514, + "learning_rate": 1e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7216233015060425, + "num_tokens": 431462350.0, + "step": 17034 + }, + { + "epoch": 1.8707445640237206, + "grad_norm": 2.2717084884643555, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7200744152069092, + "num_tokens": 431487903.0, + "step": 17035 + }, + { + "epoch": 1.8708543817263343, + "grad_norm": 2.156365394592285, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.706413745880127, + "num_tokens": 431515709.0, + "step": 17036 + }, + { + "epoch": 1.8709641994289479, + "grad_norm": 2.2517507076263428, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7124691009521484, + "num_tokens": 431540063.0, + "step": 17037 + }, + { + "epoch": 1.8710740171315616, + "grad_norm": 2.4485840797424316, + "learning_rate": 1e-06, + "loss": 0.8177, + "mean_token_accuracy": 0.7406395077705383, + "num_tokens": 431562644.0, + "step": 17038 + }, + { + "epoch": 1.8711838348341754, + "grad_norm": 2.1916613578796387, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7253581285476685, + "num_tokens": 431588087.0, + "step": 17039 + }, + { + "epoch": 1.871293652536789, + "grad_norm": 2.238131284713745, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7016046047210693, + "num_tokens": 431614226.0, + "step": 17040 + }, + { + "epoch": 1.8714034702394025, + "grad_norm": 2.6850197315216064, + "learning_rate": 1e-06, + "loss": 0.8057, + "mean_token_accuracy": 0.7461265921592712, + "num_tokens": 431632814.0, + "step": 17041 + }, + { + "epoch": 1.8715132879420162, + "grad_norm": 2.2553091049194336, + "learning_rate": 1e-06, + "loss": 0.8383, + "mean_token_accuracy": 0.735970139503479, + "num_tokens": 431657728.0, + "step": 17042 + }, + { + "epoch": 1.87162310564463, + "grad_norm": 2.1438515186309814, + "learning_rate": 1e-06, + "loss": 0.823, + 
"mean_token_accuracy": 0.7389360666275024, + "num_tokens": 431683319.0, + "step": 17043 + }, + { + "epoch": 1.8717329233472437, + "grad_norm": 2.137983560562134, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.717492401599884, + "num_tokens": 431709245.0, + "step": 17044 + }, + { + "epoch": 1.8718427410498573, + "grad_norm": 2.439837694168091, + "learning_rate": 1e-06, + "loss": 0.7605, + "mean_token_accuracy": 0.7589716911315918, + "num_tokens": 431730586.0, + "step": 17045 + }, + { + "epoch": 1.8719525587524708, + "grad_norm": 2.077913284301758, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7036059498786926, + "num_tokens": 431764875.0, + "step": 17046 + }, + { + "epoch": 1.8720623764550846, + "grad_norm": 2.376767158508301, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7229292392730713, + "num_tokens": 431786035.0, + "step": 17047 + }, + { + "epoch": 1.8721721941576983, + "grad_norm": 2.2140839099884033, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7252588272094727, + "num_tokens": 431811485.0, + "step": 17048 + }, + { + "epoch": 1.8722820118603118, + "grad_norm": 2.242549419403076, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7026188373565674, + "num_tokens": 431837987.0, + "step": 17049 + }, + { + "epoch": 1.8723918295629254, + "grad_norm": 2.437445640563965, + "learning_rate": 1e-06, + "loss": 0.8937, + "mean_token_accuracy": 0.721463680267334, + "num_tokens": 431862759.0, + "step": 17050 + }, + { + "epoch": 1.8725016472655391, + "grad_norm": 2.2771155834198, + "learning_rate": 1e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.7251402139663696, + "num_tokens": 431887599.0, + "step": 17051 + }, + { + "epoch": 1.872611464968153, + "grad_norm": 2.160022258758545, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7015305757522583, + "num_tokens": 431915637.0, + "step": 17052 + }, + { + "epoch": 1.8727212826707667, + 
"grad_norm": 2.1791746616363525, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.723513126373291, + "num_tokens": 431941132.0, + "step": 17053 + }, + { + "epoch": 1.8728311003733802, + "grad_norm": 2.1732466220855713, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7037670612335205, + "num_tokens": 431968802.0, + "step": 17054 + }, + { + "epoch": 1.8729409180759937, + "grad_norm": 2.1649558544158936, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7065377235412598, + "num_tokens": 431996604.0, + "step": 17055 + }, + { + "epoch": 1.8730507357786075, + "grad_norm": 2.3018133640289307, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7187610864639282, + "num_tokens": 432020506.0, + "step": 17056 + }, + { + "epoch": 1.8731605534812212, + "grad_norm": 2.1906068325042725, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7248493432998657, + "num_tokens": 432047993.0, + "step": 17057 + }, + { + "epoch": 1.873270371183835, + "grad_norm": 2.8219830989837646, + "learning_rate": 1e-06, + "loss": 0.7961, + "mean_token_accuracy": 0.7473547458648682, + "num_tokens": 432065281.0, + "step": 17058 + }, + { + "epoch": 1.8733801888864485, + "grad_norm": 2.6738951206207275, + "learning_rate": 1e-06, + "loss": 0.8386, + "mean_token_accuracy": 0.732610821723938, + "num_tokens": 432084532.0, + "step": 17059 + }, + { + "epoch": 1.873490006589062, + "grad_norm": 2.0262749195098877, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7114837169647217, + "num_tokens": 432116156.0, + "step": 17060 + }, + { + "epoch": 1.8735998242916758, + "grad_norm": 2.00565242767334, + "learning_rate": 1e-06, + "loss": 1.024, + "mean_token_accuracy": 0.68711256980896, + "num_tokens": 432151076.0, + "step": 17061 + }, + { + "epoch": 1.8737096419942896, + "grad_norm": 2.3721840381622314, + "learning_rate": 1e-06, + "loss": 0.8347, + "mean_token_accuracy": 0.7354810237884521, + 
"num_tokens": 432172991.0, + "step": 17062 + }, + { + "epoch": 1.8738194596969031, + "grad_norm": 2.355308771133423, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7139136791229248, + "num_tokens": 432197431.0, + "step": 17063 + }, + { + "epoch": 1.8739292773995166, + "grad_norm": 2.1339244842529297, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7060781717300415, + "num_tokens": 432227054.0, + "step": 17064 + }, + { + "epoch": 1.8740390951021304, + "grad_norm": 2.1945927143096924, + "learning_rate": 1e-06, + "loss": 0.8149, + "mean_token_accuracy": 0.7478062510490417, + "num_tokens": 432252104.0, + "step": 17065 + }, + { + "epoch": 1.8741489128047442, + "grad_norm": 2.592688798904419, + "learning_rate": 1e-06, + "loss": 0.8288, + "mean_token_accuracy": 0.7369297742843628, + "num_tokens": 432271492.0, + "step": 17066 + }, + { + "epoch": 1.874258730507358, + "grad_norm": 2.1583573818206787, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.698137640953064, + "num_tokens": 432298092.0, + "step": 17067 + }, + { + "epoch": 1.8743685482099715, + "grad_norm": 2.0921144485473633, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7283058762550354, + "num_tokens": 432327391.0, + "step": 17068 + }, + { + "epoch": 1.874478365912585, + "grad_norm": 2.19087553024292, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7280000448226929, + "num_tokens": 432353446.0, + "step": 17069 + }, + { + "epoch": 1.8745881836151987, + "grad_norm": 2.2165005207061768, + "learning_rate": 1e-06, + "loss": 0.8697, + "mean_token_accuracy": 0.7313222885131836, + "num_tokens": 432379471.0, + "step": 17070 + }, + { + "epoch": 1.8746980013178125, + "grad_norm": 2.3184280395507812, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7218173742294312, + "num_tokens": 432405685.0, + "step": 17071 + }, + { + "epoch": 1.8748078190204263, + "grad_norm": 2.0604686737060547, + 
"learning_rate": 1e-06, + "loss": 0.917, + "mean_token_accuracy": 0.7072727084159851, + "num_tokens": 432436797.0, + "step": 17072 + }, + { + "epoch": 1.8749176367230398, + "grad_norm": 2.2790074348449707, + "learning_rate": 1e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7212385535240173, + "num_tokens": 432461797.0, + "step": 17073 + }, + { + "epoch": 1.8750274544256533, + "grad_norm": 2.8361785411834717, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7159266471862793, + "num_tokens": 432480593.0, + "step": 17074 + }, + { + "epoch": 1.875137272128267, + "grad_norm": 2.33927583694458, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.726075291633606, + "num_tokens": 432503648.0, + "step": 17075 + }, + { + "epoch": 1.8752470898308808, + "grad_norm": 2.2559633255004883, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7152194976806641, + "num_tokens": 432528699.0, + "step": 17076 + }, + { + "epoch": 1.8753569075334944, + "grad_norm": 2.343045711517334, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7147738933563232, + "num_tokens": 432552404.0, + "step": 17077 + }, + { + "epoch": 1.875466725236108, + "grad_norm": 2.2858598232269287, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.713697075843811, + "num_tokens": 432577137.0, + "step": 17078 + }, + { + "epoch": 1.8755765429387217, + "grad_norm": 2.0160372257232666, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.7297306656837463, + "num_tokens": 432608582.0, + "step": 17079 + }, + { + "epoch": 1.8756863606413354, + "grad_norm": 2.4319167137145996, + "learning_rate": 1e-06, + "loss": 0.8283, + "mean_token_accuracy": 0.7413287162780762, + "num_tokens": 432632777.0, + "step": 17080 + }, + { + "epoch": 1.8757961783439492, + "grad_norm": 2.4161579608917236, + "learning_rate": 1e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7218247652053833, + "num_tokens": 432658792.0, + "step": 
17081 + }, + { + "epoch": 1.8759059960465627, + "grad_norm": 2.259422540664673, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7335473299026489, + "num_tokens": 432684738.0, + "step": 17082 + }, + { + "epoch": 1.8760158137491763, + "grad_norm": 2.560251235961914, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7250589728355408, + "num_tokens": 432705450.0, + "step": 17083 + }, + { + "epoch": 1.87612563145179, + "grad_norm": 2.1844189167022705, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7254455089569092, + "num_tokens": 432730678.0, + "step": 17084 + }, + { + "epoch": 1.8762354491544038, + "grad_norm": 2.1704046726226807, + "learning_rate": 1e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7295886278152466, + "num_tokens": 432757996.0, + "step": 17085 + }, + { + "epoch": 1.8763452668570173, + "grad_norm": 2.049464702606201, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7097486257553101, + "num_tokens": 432789255.0, + "step": 17086 + }, + { + "epoch": 1.876455084559631, + "grad_norm": 2.316477060317993, + "learning_rate": 1e-06, + "loss": 0.8546, + "mean_token_accuracy": 0.7311739921569824, + "num_tokens": 432813501.0, + "step": 17087 + }, + { + "epoch": 1.8765649022622446, + "grad_norm": 2.4948551654815674, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7241586446762085, + "num_tokens": 432834203.0, + "step": 17088 + }, + { + "epoch": 1.8766747199648584, + "grad_norm": 2.185736656188965, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7152881622314453, + "num_tokens": 432861593.0, + "step": 17089 + }, + { + "epoch": 1.876784537667472, + "grad_norm": 2.2118337154388428, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7168738842010498, + "num_tokens": 432886240.0, + "step": 17090 + }, + { + "epoch": 1.8768943553700856, + "grad_norm": 2.244112014770508, + "learning_rate": 1e-06, + "loss": 0.8261, + 
"mean_token_accuracy": 0.7450497150421143, + "num_tokens": 432909536.0, + "step": 17091 + }, + { + "epoch": 1.8770041730726992, + "grad_norm": 2.1400017738342285, + "learning_rate": 1e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7175275087356567, + "num_tokens": 432936690.0, + "step": 17092 + }, + { + "epoch": 1.877113990775313, + "grad_norm": 2.1548306941986084, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.689200758934021, + "num_tokens": 432963099.0, + "step": 17093 + }, + { + "epoch": 1.8772238084779267, + "grad_norm": 2.6437578201293945, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7222744226455688, + "num_tokens": 432984716.0, + "step": 17094 + }, + { + "epoch": 1.8773336261805404, + "grad_norm": 2.122345447540283, + "learning_rate": 1e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7429929971694946, + "num_tokens": 433011179.0, + "step": 17095 + }, + { + "epoch": 1.877443443883154, + "grad_norm": 2.216254234313965, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7247463464736938, + "num_tokens": 433036197.0, + "step": 17096 + }, + { + "epoch": 1.8775532615857675, + "grad_norm": 2.0905706882476807, + "learning_rate": 1e-06, + "loss": 0.8576, + "mean_token_accuracy": 0.7290002107620239, + "num_tokens": 433062883.0, + "step": 17097 + }, + { + "epoch": 1.8776630792883813, + "grad_norm": 2.4755120277404785, + "learning_rate": 1e-06, + "loss": 0.8513, + "mean_token_accuracy": 0.7329373359680176, + "num_tokens": 433083124.0, + "step": 17098 + }, + { + "epoch": 1.877772896990995, + "grad_norm": 2.39194393157959, + "learning_rate": 1e-06, + "loss": 0.8089, + "mean_token_accuracy": 0.7474839091300964, + "num_tokens": 433104304.0, + "step": 17099 + }, + { + "epoch": 1.8778827146936086, + "grad_norm": 1.9192883968353271, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7075059413909912, + "num_tokens": 433138827.0, + "step": 17100 + }, + { + "epoch": 1.8779925323962223, 
+ "grad_norm": 2.3741228580474854, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7100538015365601, + "num_tokens": 433163660.0, + "step": 17101 + }, + { + "epoch": 1.8781023500988359, + "grad_norm": 2.4012815952301025, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7186300158500671, + "num_tokens": 433186926.0, + "step": 17102 + }, + { + "epoch": 1.8782121678014496, + "grad_norm": 2.2724292278289795, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.6959007978439331, + "num_tokens": 433213100.0, + "step": 17103 + }, + { + "epoch": 1.8783219855040634, + "grad_norm": 2.5792551040649414, + "learning_rate": 1e-06, + "loss": 0.796, + "mean_token_accuracy": 0.7447726726531982, + "num_tokens": 433232667.0, + "step": 17104 + }, + { + "epoch": 1.878431803206677, + "grad_norm": 2.135744571685791, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7068163752555847, + "num_tokens": 433261095.0, + "step": 17105 + }, + { + "epoch": 1.8785416209092904, + "grad_norm": 1.9173487424850464, + "learning_rate": 1e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.72154301404953, + "num_tokens": 433295396.0, + "step": 17106 + }, + { + "epoch": 1.8786514386119042, + "grad_norm": 2.4362354278564453, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7228226661682129, + "num_tokens": 433319909.0, + "step": 17107 + }, + { + "epoch": 1.878761256314518, + "grad_norm": 2.474358558654785, + "learning_rate": 1e-06, + "loss": 0.8572, + "mean_token_accuracy": 0.7356549501419067, + "num_tokens": 433339918.0, + "step": 17108 + }, + { + "epoch": 1.8788710740171317, + "grad_norm": 2.068525791168213, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7212176322937012, + "num_tokens": 433367597.0, + "step": 17109 + }, + { + "epoch": 1.8789808917197452, + "grad_norm": 1.9214165210723877, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7127189040184021, + 
"num_tokens": 433401243.0, + "step": 17110 + }, + { + "epoch": 1.8790907094223588, + "grad_norm": 2.0308351516723633, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7052578926086426, + "num_tokens": 433430209.0, + "step": 17111 + }, + { + "epoch": 1.8792005271249725, + "grad_norm": 2.3672571182250977, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7283599376678467, + "num_tokens": 433454021.0, + "step": 17112 + }, + { + "epoch": 1.8793103448275863, + "grad_norm": 2.2523531913757324, + "learning_rate": 1e-06, + "loss": 0.8778, + "mean_token_accuracy": 0.7339519262313843, + "num_tokens": 433478872.0, + "step": 17113 + }, + { + "epoch": 1.8794201625301998, + "grad_norm": 2.2983129024505615, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7010762691497803, + "num_tokens": 433505304.0, + "step": 17114 + }, + { + "epoch": 1.8795299802328134, + "grad_norm": 2.555589199066162, + "learning_rate": 1e-06, + "loss": 0.846, + "mean_token_accuracy": 0.7266747951507568, + "num_tokens": 433525651.0, + "step": 17115 + }, + { + "epoch": 1.8796397979354271, + "grad_norm": 2.0780246257781982, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7207863330841064, + "num_tokens": 433553569.0, + "step": 17116 + }, + { + "epoch": 1.8797496156380409, + "grad_norm": 2.4637365341186523, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7061717510223389, + "num_tokens": 433575440.0, + "step": 17117 + }, + { + "epoch": 1.8798594333406546, + "grad_norm": 2.385667562484741, + "learning_rate": 1e-06, + "loss": 0.8057, + "mean_token_accuracy": 0.7500320672988892, + "num_tokens": 433596750.0, + "step": 17118 + }, + { + "epoch": 1.8799692510432682, + "grad_norm": 2.254307270050049, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7046984434127808, + "num_tokens": 433623489.0, + "step": 17119 + }, + { + "epoch": 1.8800790687458817, + "grad_norm": 2.4095513820648193, + 
"learning_rate": 1e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7236401438713074, + "num_tokens": 433646121.0, + "step": 17120 + }, + { + "epoch": 1.8801888864484955, + "grad_norm": 2.5785908699035645, + "learning_rate": 1e-06, + "loss": 0.7962, + "mean_token_accuracy": 0.7439038753509521, + "num_tokens": 433664645.0, + "step": 17121 + }, + { + "epoch": 1.8802987041511092, + "grad_norm": 1.9782415628433228, + "learning_rate": 1e-06, + "loss": 0.8757, + "mean_token_accuracy": 0.7280499339103699, + "num_tokens": 433694082.0, + "step": 17122 + }, + { + "epoch": 1.880408521853723, + "grad_norm": 2.190889835357666, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7112032175064087, + "num_tokens": 433720815.0, + "step": 17123 + }, + { + "epoch": 1.8805183395563365, + "grad_norm": 2.1906392574310303, + "learning_rate": 1e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7132795453071594, + "num_tokens": 433746510.0, + "step": 17124 + }, + { + "epoch": 1.88062815725895, + "grad_norm": 2.0252819061279297, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7066215872764587, + "num_tokens": 433775804.0, + "step": 17125 + }, + { + "epoch": 1.8807379749615638, + "grad_norm": 1.9800266027450562, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7155151963233948, + "num_tokens": 433806840.0, + "step": 17126 + }, + { + "epoch": 1.8808477926641776, + "grad_norm": 2.3574812412261963, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7257044911384583, + "num_tokens": 433830076.0, + "step": 17127 + }, + { + "epoch": 1.880957610366791, + "grad_norm": 2.2027995586395264, + "learning_rate": 1e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.7151477932929993, + "num_tokens": 433859139.0, + "step": 17128 + }, + { + "epoch": 1.8810674280694046, + "grad_norm": 2.2877066135406494, + "learning_rate": 1e-06, + "loss": 0.8644, + "mean_token_accuracy": 0.728644073009491, + "num_tokens": 433883934.0, + 
"step": 17129 + }, + { + "epoch": 1.8811772457720184, + "grad_norm": 2.36344575881958, + "learning_rate": 1e-06, + "loss": 0.809, + "mean_token_accuracy": 0.7414419054985046, + "num_tokens": 433904916.0, + "step": 17130 + }, + { + "epoch": 1.8812870634746321, + "grad_norm": 2.2793686389923096, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7199028730392456, + "num_tokens": 433931210.0, + "step": 17131 + }, + { + "epoch": 1.881396881177246, + "grad_norm": 2.3611433506011963, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7127453684806824, + "num_tokens": 433955208.0, + "step": 17132 + }, + { + "epoch": 1.8815066988798594, + "grad_norm": 2.3390495777130127, + "learning_rate": 1e-06, + "loss": 0.8195, + "mean_token_accuracy": 0.7452244758605957, + "num_tokens": 433978693.0, + "step": 17133 + }, + { + "epoch": 1.881616516582473, + "grad_norm": 2.1064534187316895, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7142917513847351, + "num_tokens": 434006525.0, + "step": 17134 + }, + { + "epoch": 1.8817263342850867, + "grad_norm": 2.321193218231201, + "learning_rate": 1e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.7359104752540588, + "num_tokens": 434030552.0, + "step": 17135 + }, + { + "epoch": 1.8818361519877005, + "grad_norm": 2.4623217582702637, + "learning_rate": 1e-06, + "loss": 0.8417, + "mean_token_accuracy": 0.7403452396392822, + "num_tokens": 434051837.0, + "step": 17136 + }, + { + "epoch": 1.881945969690314, + "grad_norm": 2.4084832668304443, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7253976464271545, + "num_tokens": 434076156.0, + "step": 17137 + }, + { + "epoch": 1.8820557873929278, + "grad_norm": 2.5344810485839844, + "learning_rate": 1e-06, + "loss": 0.7753, + "mean_token_accuracy": 0.747439980506897, + "num_tokens": 434096189.0, + "step": 17138 + }, + { + "epoch": 1.8821656050955413, + "grad_norm": 2.3071699142456055, + "learning_rate": 1e-06, + "loss": 
0.8283, + "mean_token_accuracy": 0.7411420941352844, + "num_tokens": 434118911.0, + "step": 17139 + }, + { + "epoch": 1.882275422798155, + "grad_norm": 2.384735345840454, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7011035680770874, + "num_tokens": 434144740.0, + "step": 17140 + }, + { + "epoch": 1.8823852405007688, + "grad_norm": 2.3447437286376953, + "learning_rate": 1e-06, + "loss": 0.8615, + "mean_token_accuracy": 0.7271298170089722, + "num_tokens": 434169473.0, + "step": 17141 + }, + { + "epoch": 1.8824950582033824, + "grad_norm": 1.896385669708252, + "learning_rate": 1e-06, + "loss": 0.8643, + "mean_token_accuracy": 0.7418617606163025, + "num_tokens": 434200896.0, + "step": 17142 + }, + { + "epoch": 1.882604875905996, + "grad_norm": 2.226064920425415, + "learning_rate": 1e-06, + "loss": 0.8657, + "mean_token_accuracy": 0.7226697206497192, + "num_tokens": 434226450.0, + "step": 17143 + }, + { + "epoch": 1.8827146936086097, + "grad_norm": 2.360222578048706, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7254379987716675, + "num_tokens": 434251338.0, + "step": 17144 + }, + { + "epoch": 1.8828245113112234, + "grad_norm": 2.3798882961273193, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7115763425827026, + "num_tokens": 434274257.0, + "step": 17145 + }, + { + "epoch": 1.8829343290138372, + "grad_norm": 2.2584521770477295, + "learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7226110696792603, + "num_tokens": 434299271.0, + "step": 17146 + }, + { + "epoch": 1.8830441467164507, + "grad_norm": 2.2963263988494873, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7052915096282959, + "num_tokens": 434325067.0, + "step": 17147 + }, + { + "epoch": 1.8831539644190642, + "grad_norm": 2.0257179737091064, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7052534222602844, + "num_tokens": 434355591.0, + "step": 17148 + }, + { + "epoch": 
1.883263782121678, + "grad_norm": 2.2144076824188232, + "learning_rate": 1e-06, + "loss": 0.8396, + "mean_token_accuracy": 0.7381184697151184, + "num_tokens": 434378535.0, + "step": 17149 + }, + { + "epoch": 1.8833735998242918, + "grad_norm": 2.127774238586426, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.715682864189148, + "num_tokens": 434406216.0, + "step": 17150 + }, + { + "epoch": 1.8834834175269053, + "grad_norm": 2.3214118480682373, + "learning_rate": 1e-06, + "loss": 0.847, + "mean_token_accuracy": 0.7288256883621216, + "num_tokens": 434430239.0, + "step": 17151 + }, + { + "epoch": 1.883593235229519, + "grad_norm": 2.2257297039031982, + "learning_rate": 1e-06, + "loss": 0.8211, + "mean_token_accuracy": 0.741313099861145, + "num_tokens": 434454305.0, + "step": 17152 + }, + { + "epoch": 1.8837030529321326, + "grad_norm": 2.6222832202911377, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7273779511451721, + "num_tokens": 434474051.0, + "step": 17153 + }, + { + "epoch": 1.8838128706347463, + "grad_norm": 2.297727346420288, + "learning_rate": 1e-06, + "loss": 0.842, + "mean_token_accuracy": 0.7271547317504883, + "num_tokens": 434497443.0, + "step": 17154 + }, + { + "epoch": 1.88392268833736, + "grad_norm": 2.2259790897369385, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7242011427879333, + "num_tokens": 434521928.0, + "step": 17155 + }, + { + "epoch": 1.8840325060399736, + "grad_norm": 2.2781248092651367, + "learning_rate": 1e-06, + "loss": 0.809, + "mean_token_accuracy": 0.735000491142273, + "num_tokens": 434544175.0, + "step": 17156 + }, + { + "epoch": 1.8841423237425872, + "grad_norm": 2.4002130031585693, + "learning_rate": 1e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.7263489961624146, + "num_tokens": 434566801.0, + "step": 17157 + }, + { + "epoch": 1.884252141445201, + "grad_norm": 2.2551350593566895, + "learning_rate": 1e-06, + "loss": 0.8605, + "mean_token_accuracy": 
0.7253998517990112, + "num_tokens": 434592127.0, + "step": 17158 + }, + { + "epoch": 1.8843619591478147, + "grad_norm": 2.6819169521331787, + "learning_rate": 1e-06, + "loss": 0.8435, + "mean_token_accuracy": 0.7337014675140381, + "num_tokens": 434611204.0, + "step": 17159 + }, + { + "epoch": 1.8844717768504284, + "grad_norm": 2.208688735961914, + "learning_rate": 1e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.7358965277671814, + "num_tokens": 434636949.0, + "step": 17160 + }, + { + "epoch": 1.884581594553042, + "grad_norm": 2.469222068786621, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7203596830368042, + "num_tokens": 434657998.0, + "step": 17161 + }, + { + "epoch": 1.8846914122556555, + "grad_norm": 2.313453197479248, + "learning_rate": 1e-06, + "loss": 0.8466, + "mean_token_accuracy": 0.7333974838256836, + "num_tokens": 434680321.0, + "step": 17162 + }, + { + "epoch": 1.8848012299582693, + "grad_norm": 2.1828629970550537, + "learning_rate": 1e-06, + "loss": 0.8458, + "mean_token_accuracy": 0.73724365234375, + "num_tokens": 434706604.0, + "step": 17163 + }, + { + "epoch": 1.884911047660883, + "grad_norm": 2.1251308917999268, + "learning_rate": 1e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7239513993263245, + "num_tokens": 434734543.0, + "step": 17164 + }, + { + "epoch": 1.8850208653634966, + "grad_norm": 2.021676540374756, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7084991931915283, + "num_tokens": 434766354.0, + "step": 17165 + }, + { + "epoch": 1.88513068306611, + "grad_norm": 2.186020851135254, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7160212397575378, + "num_tokens": 434793055.0, + "step": 17166 + }, + { + "epoch": 1.8852405007687238, + "grad_norm": 2.3196768760681152, + "learning_rate": 1e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7250131368637085, + "num_tokens": 434817450.0, + "step": 17167 + }, + { + "epoch": 1.8853503184713376, + "grad_norm": 
2.2651734352111816, + "learning_rate": 1e-06, + "loss": 0.8254, + "mean_token_accuracy": 0.73775315284729, + "num_tokens": 434841863.0, + "step": 17168 + }, + { + "epoch": 1.8854601361739514, + "grad_norm": 2.206240653991699, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7282891869544983, + "num_tokens": 434867780.0, + "step": 17169 + }, + { + "epoch": 1.885569953876565, + "grad_norm": 2.025034189224243, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7107523679733276, + "num_tokens": 434896223.0, + "step": 17170 + }, + { + "epoch": 1.8856797715791784, + "grad_norm": 2.0575320720672607, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7103714942932129, + "num_tokens": 434926567.0, + "step": 17171 + }, + { + "epoch": 1.8857895892817922, + "grad_norm": 2.358414888381958, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7343640327453613, + "num_tokens": 434948940.0, + "step": 17172 + }, + { + "epoch": 1.885899406984406, + "grad_norm": 2.230151891708374, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7087170481681824, + "num_tokens": 434972849.0, + "step": 17173 + }, + { + "epoch": 1.8860092246870197, + "grad_norm": 2.6649880409240723, + "learning_rate": 1e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7247508764266968, + "num_tokens": 434993611.0, + "step": 17174 + }, + { + "epoch": 1.8861190423896332, + "grad_norm": 2.038278579711914, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.690231204032898, + "num_tokens": 435024115.0, + "step": 17175 + }, + { + "epoch": 1.8862288600922468, + "grad_norm": 2.229259490966797, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7051258087158203, + "num_tokens": 435050185.0, + "step": 17176 + }, + { + "epoch": 1.8863386777948605, + "grad_norm": 2.121490955352783, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7118586897850037, + "num_tokens": 
435076844.0, + "step": 17177 + }, + { + "epoch": 1.8864484954974743, + "grad_norm": 2.1781721115112305, + "learning_rate": 1e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.715575635433197, + "num_tokens": 435103330.0, + "step": 17178 + }, + { + "epoch": 1.8865583132000878, + "grad_norm": 2.1515040397644043, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7229278683662415, + "num_tokens": 435129755.0, + "step": 17179 + }, + { + "epoch": 1.8866681309027014, + "grad_norm": 1.9137641191482544, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.7269731760025024, + "num_tokens": 435160418.0, + "step": 17180 + }, + { + "epoch": 1.8867779486053151, + "grad_norm": 2.0746207237243652, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.690772533416748, + "num_tokens": 435192850.0, + "step": 17181 + }, + { + "epoch": 1.8868877663079289, + "grad_norm": 1.9864598512649536, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7015448808670044, + "num_tokens": 435225762.0, + "step": 17182 + }, + { + "epoch": 1.8869975840105426, + "grad_norm": 2.564671277999878, + "learning_rate": 1e-06, + "loss": 0.8344, + "mean_token_accuracy": 0.7323868274688721, + "num_tokens": 435246084.0, + "step": 17183 + }, + { + "epoch": 1.8871074017131562, + "grad_norm": 2.160346269607544, + "learning_rate": 1e-06, + "loss": 0.8464, + "mean_token_accuracy": 0.740109384059906, + "num_tokens": 435272678.0, + "step": 17184 + }, + { + "epoch": 1.8872172194157697, + "grad_norm": 2.2860734462738037, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.6990600824356079, + "num_tokens": 435299223.0, + "step": 17185 + }, + { + "epoch": 1.8873270371183835, + "grad_norm": 2.5730223655700684, + "learning_rate": 1e-06, + "loss": 0.8783, + "mean_token_accuracy": 0.7307188510894775, + "num_tokens": 435319751.0, + "step": 17186 + }, + { + "epoch": 1.8874368548209972, + "grad_norm": 2.23396372795105, + "learning_rate": 
1e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.7190513014793396, + "num_tokens": 435345388.0, + "step": 17187 + }, + { + "epoch": 1.887546672523611, + "grad_norm": 2.1145284175872803, + "learning_rate": 1e-06, + "loss": 0.8657, + "mean_token_accuracy": 0.7291528582572937, + "num_tokens": 435372395.0, + "step": 17188 + }, + { + "epoch": 1.8876564902262245, + "grad_norm": 2.524674892425537, + "learning_rate": 1e-06, + "loss": 0.8269, + "mean_token_accuracy": 0.7387036085128784, + "num_tokens": 435393190.0, + "step": 17189 + }, + { + "epoch": 1.887766307928838, + "grad_norm": 2.374387264251709, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.6897574663162231, + "num_tokens": 435417970.0, + "step": 17190 + }, + { + "epoch": 1.8878761256314518, + "grad_norm": 2.166910171508789, + "learning_rate": 1e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7312801480293274, + "num_tokens": 435444644.0, + "step": 17191 + }, + { + "epoch": 1.8879859433340656, + "grad_norm": 2.02695631980896, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7121482491493225, + "num_tokens": 435475105.0, + "step": 17192 + }, + { + "epoch": 1.888095761036679, + "grad_norm": 2.274582624435425, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7329555749893188, + "num_tokens": 435500011.0, + "step": 17193 + }, + { + "epoch": 1.8882055787392926, + "grad_norm": 2.131983757019043, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.719926118850708, + "num_tokens": 435527685.0, + "step": 17194 + }, + { + "epoch": 1.8883153964419064, + "grad_norm": 2.263747453689575, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.7118891477584839, + "num_tokens": 435552332.0, + "step": 17195 + }, + { + "epoch": 1.8884252141445201, + "grad_norm": 2.127413034439087, + "learning_rate": 1e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.7274267673492432, + "num_tokens": 435578682.0, + "step": 17196 + }, + { + 
"epoch": 1.888535031847134, + "grad_norm": 2.445411205291748, + "learning_rate": 1e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.7338738441467285, + "num_tokens": 435598765.0, + "step": 17197 + }, + { + "epoch": 1.8886448495497474, + "grad_norm": 2.32010817527771, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7097388505935669, + "num_tokens": 435623715.0, + "step": 17198 + }, + { + "epoch": 1.888754667252361, + "grad_norm": 2.310692310333252, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.6905003190040588, + "num_tokens": 435653167.0, + "step": 17199 + }, + { + "epoch": 1.8888644849549747, + "grad_norm": 2.1235365867614746, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7274852991104126, + "num_tokens": 435680136.0, + "step": 17200 + }, + { + "epoch": 1.8889743026575885, + "grad_norm": 2.4031121730804443, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7098633050918579, + "num_tokens": 435703848.0, + "step": 17201 + }, + { + "epoch": 1.889084120360202, + "grad_norm": 2.435570478439331, + "learning_rate": 1e-06, + "loss": 0.8319, + "mean_token_accuracy": 0.7405924797058105, + "num_tokens": 435724793.0, + "step": 17202 + }, + { + "epoch": 1.8891939380628158, + "grad_norm": 2.159182071685791, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.6916590929031372, + "num_tokens": 435753339.0, + "step": 17203 + }, + { + "epoch": 1.8893037557654293, + "grad_norm": 2.353956699371338, + "learning_rate": 1e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.7403242588043213, + "num_tokens": 435776369.0, + "step": 17204 + }, + { + "epoch": 1.889413573468043, + "grad_norm": 2.121297836303711, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.715755820274353, + "num_tokens": 435807417.0, + "step": 17205 + }, + { + "epoch": 1.8895233911706568, + "grad_norm": 2.5091707706451416, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 
0.7126508951187134, + "num_tokens": 435829071.0, + "step": 17206 + }, + { + "epoch": 1.8896332088732704, + "grad_norm": 2.1215105056762695, + "learning_rate": 1e-06, + "loss": 0.8109, + "mean_token_accuracy": 0.7425376176834106, + "num_tokens": 435854558.0, + "step": 17207 + }, + { + "epoch": 1.8897430265758839, + "grad_norm": 2.193385362625122, + "learning_rate": 1e-06, + "loss": 0.863, + "mean_token_accuracy": 0.7289185523986816, + "num_tokens": 435880175.0, + "step": 17208 + }, + { + "epoch": 1.8898528442784976, + "grad_norm": 2.2636592388153076, + "learning_rate": 1e-06, + "loss": 0.8141, + "mean_token_accuracy": 0.7411604523658752, + "num_tokens": 435905270.0, + "step": 17209 + }, + { + "epoch": 1.8899626619811114, + "grad_norm": 1.9551746845245361, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.708091139793396, + "num_tokens": 435937300.0, + "step": 17210 + }, + { + "epoch": 1.8900724796837252, + "grad_norm": 2.3775887489318848, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7266753315925598, + "num_tokens": 435962200.0, + "step": 17211 + }, + { + "epoch": 1.8901822973863387, + "grad_norm": 2.152055501937866, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.734391450881958, + "num_tokens": 435987943.0, + "step": 17212 + }, + { + "epoch": 1.8902921150889522, + "grad_norm": 2.3890085220336914, + "learning_rate": 1e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7457926273345947, + "num_tokens": 436009905.0, + "step": 17213 + }, + { + "epoch": 1.890401932791566, + "grad_norm": 2.7404401302337646, + "learning_rate": 1e-06, + "loss": 0.8369, + "mean_token_accuracy": 0.7401840686798096, + "num_tokens": 436028665.0, + "step": 17214 + }, + { + "epoch": 1.8905117504941797, + "grad_norm": 2.483482599258423, + "learning_rate": 1e-06, + "loss": 0.8704, + "mean_token_accuracy": 0.730739176273346, + "num_tokens": 436048682.0, + "step": 17215 + }, + { + "epoch": 1.8906215681967933, + "grad_norm": 
2.366482973098755, + "learning_rate": 1e-06, + "loss": 0.8141, + "mean_token_accuracy": 0.7385985851287842, + "num_tokens": 436071083.0, + "step": 17216 + }, + { + "epoch": 1.890731385899407, + "grad_norm": 2.321741819381714, + "learning_rate": 1e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7221155762672424, + "num_tokens": 436096579.0, + "step": 17217 + }, + { + "epoch": 1.8908412036020206, + "grad_norm": 2.3595097064971924, + "learning_rate": 1e-06, + "loss": 0.8474, + "mean_token_accuracy": 0.7300546169281006, + "num_tokens": 436118046.0, + "step": 17218 + }, + { + "epoch": 1.8909510213046343, + "grad_norm": 2.2688775062561035, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7226019501686096, + "num_tokens": 436141663.0, + "step": 17219 + }, + { + "epoch": 1.891060839007248, + "grad_norm": 2.2838222980499268, + "learning_rate": 1e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.7285451292991638, + "num_tokens": 436165321.0, + "step": 17220 + }, + { + "epoch": 1.8911706567098616, + "grad_norm": 2.1185264587402344, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7204874753952026, + "num_tokens": 436192654.0, + "step": 17221 + }, + { + "epoch": 1.8912804744124752, + "grad_norm": 2.2351555824279785, + "learning_rate": 1e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7275886535644531, + "num_tokens": 436216842.0, + "step": 17222 + }, + { + "epoch": 1.891390292115089, + "grad_norm": 2.2775204181671143, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.723092257976532, + "num_tokens": 436241846.0, + "step": 17223 + }, + { + "epoch": 1.8915001098177027, + "grad_norm": 2.028383493423462, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7004503011703491, + "num_tokens": 436270431.0, + "step": 17224 + }, + { + "epoch": 1.8916099275203164, + "grad_norm": 2.252333641052246, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7089846730232239, + "num_tokens": 
436298856.0, + "step": 17225 + }, + { + "epoch": 1.89171974522293, + "grad_norm": 2.4194345474243164, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7150524854660034, + "num_tokens": 436321384.0, + "step": 17226 + }, + { + "epoch": 1.8918295629255435, + "grad_norm": 2.086132526397705, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7142525911331177, + "num_tokens": 436349688.0, + "step": 17227 + }, + { + "epoch": 1.8919393806281573, + "grad_norm": 2.299213409423828, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7139320969581604, + "num_tokens": 436372849.0, + "step": 17228 + }, + { + "epoch": 1.892049198330771, + "grad_norm": 2.192183494567871, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7117300033569336, + "num_tokens": 436398197.0, + "step": 17229 + }, + { + "epoch": 1.8921590160333845, + "grad_norm": 2.4209654331207275, + "learning_rate": 1e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.736470103263855, + "num_tokens": 436420303.0, + "step": 17230 + }, + { + "epoch": 1.892268833735998, + "grad_norm": 2.5386579036712646, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7265100479125977, + "num_tokens": 436441374.0, + "step": 17231 + }, + { + "epoch": 1.8923786514386118, + "grad_norm": 2.120509624481201, + "learning_rate": 1e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.7254629135131836, + "num_tokens": 436469557.0, + "step": 17232 + }, + { + "epoch": 1.8924884691412256, + "grad_norm": 2.227543830871582, + "learning_rate": 1e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.7421814203262329, + "num_tokens": 436493379.0, + "step": 17233 + }, + { + "epoch": 1.8925982868438394, + "grad_norm": 2.0544002056121826, + "learning_rate": 1e-06, + "loss": 0.7903, + "mean_token_accuracy": 0.7474364042282104, + "num_tokens": 436519946.0, + "step": 17234 + }, + { + "epoch": 1.8927081045464529, + "grad_norm": 2.1706576347351074, + "learning_rate": 
1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.722354531288147, + "num_tokens": 436549418.0, + "step": 17235 + }, + { + "epoch": 1.8928179222490664, + "grad_norm": 2.2144718170166016, + "learning_rate": 1e-06, + "loss": 0.8069, + "mean_token_accuracy": 0.7393238544464111, + "num_tokens": 436573809.0, + "step": 17236 + }, + { + "epoch": 1.8929277399516802, + "grad_norm": 2.1680991649627686, + "learning_rate": 1e-06, + "loss": 0.8239, + "mean_token_accuracy": 0.7398778796195984, + "num_tokens": 436602113.0, + "step": 17237 + }, + { + "epoch": 1.893037557654294, + "grad_norm": 2.2393534183502197, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7135864496231079, + "num_tokens": 436627851.0, + "step": 17238 + }, + { + "epoch": 1.8931473753569077, + "grad_norm": 2.3444647789001465, + "learning_rate": 1e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7279489636421204, + "num_tokens": 436651618.0, + "step": 17239 + }, + { + "epoch": 1.8932571930595212, + "grad_norm": 2.0023932456970215, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7044093012809753, + "num_tokens": 436682324.0, + "step": 17240 + }, + { + "epoch": 1.8933670107621348, + "grad_norm": 2.2559263706207275, + "learning_rate": 1e-06, + "loss": 0.8178, + "mean_token_accuracy": 0.7415085434913635, + "num_tokens": 436705161.0, + "step": 17241 + }, + { + "epoch": 1.8934768284647485, + "grad_norm": 2.2000296115875244, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7070614695549011, + "num_tokens": 436733752.0, + "step": 17242 + }, + { + "epoch": 1.8935866461673623, + "grad_norm": 2.4248812198638916, + "learning_rate": 1e-06, + "loss": 0.8251, + "mean_token_accuracy": 0.7359155416488647, + "num_tokens": 436756719.0, + "step": 17243 + }, + { + "epoch": 1.8936964638699758, + "grad_norm": 2.2089853286743164, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7274786829948425, + "num_tokens": 436781019.0, + "step": 17244 + }, + 
{ + "epoch": 1.8938062815725893, + "grad_norm": 2.353060007095337, + "learning_rate": 1e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7164018750190735, + "num_tokens": 436805058.0, + "step": 17245 + }, + { + "epoch": 1.893916099275203, + "grad_norm": 2.3874359130859375, + "learning_rate": 1e-06, + "loss": 0.8456, + "mean_token_accuracy": 0.7355407476425171, + "num_tokens": 436828177.0, + "step": 17246 + }, + { + "epoch": 1.8940259169778169, + "grad_norm": 2.144101619720459, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.7017248868942261, + "num_tokens": 436857217.0, + "step": 17247 + }, + { + "epoch": 1.8941357346804306, + "grad_norm": 2.2343809604644775, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7125437259674072, + "num_tokens": 436884907.0, + "step": 17248 + }, + { + "epoch": 1.8942455523830442, + "grad_norm": 2.314692974090576, + "learning_rate": 1e-06, + "loss": 0.8179, + "mean_token_accuracy": 0.7372728586196899, + "num_tokens": 436909014.0, + "step": 17249 + }, + { + "epoch": 1.8943553700856577, + "grad_norm": 2.3284966945648193, + "learning_rate": 1e-06, + "loss": 0.8572, + "mean_token_accuracy": 0.7261772751808167, + "num_tokens": 436931500.0, + "step": 17250 + }, + { + "epoch": 1.8944651877882714, + "grad_norm": 2.0479395389556885, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7198512554168701, + "num_tokens": 436961359.0, + "step": 17251 + }, + { + "epoch": 1.8945750054908852, + "grad_norm": 2.122363805770874, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7003003358840942, + "num_tokens": 436990338.0, + "step": 17252 + }, + { + "epoch": 1.894684823193499, + "grad_norm": 2.0505387783050537, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7213979959487915, + "num_tokens": 437020187.0, + "step": 17253 + }, + { + "epoch": 1.8947946408961125, + "grad_norm": 2.431380033493042, + "learning_rate": 1e-06, + "loss": 0.7317, + 
"mean_token_accuracy": 0.7669927477836609, + "num_tokens": 437041262.0, + "step": 17254 + }, + { + "epoch": 1.894904458598726, + "grad_norm": 2.420525550842285, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.723494291305542, + "num_tokens": 437063232.0, + "step": 17255 + }, + { + "epoch": 1.8950142763013398, + "grad_norm": 2.0573413372039795, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7245357632637024, + "num_tokens": 437092809.0, + "step": 17256 + }, + { + "epoch": 1.8951240940039535, + "grad_norm": 2.3407318592071533, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7177302837371826, + "num_tokens": 437116122.0, + "step": 17257 + }, + { + "epoch": 1.895233911706567, + "grad_norm": 2.198666572570801, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7223594188690186, + "num_tokens": 437141571.0, + "step": 17258 + }, + { + "epoch": 1.8953437294091806, + "grad_norm": 2.406630516052246, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7240300178527832, + "num_tokens": 437164343.0, + "step": 17259 + }, + { + "epoch": 1.8954535471117944, + "grad_norm": 2.2757174968719482, + "learning_rate": 1e-06, + "loss": 0.8489, + "mean_token_accuracy": 0.729383111000061, + "num_tokens": 437188877.0, + "step": 17260 + }, + { + "epoch": 1.8955633648144081, + "grad_norm": 2.3300206661224365, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7082605361938477, + "num_tokens": 437216881.0, + "step": 17261 + }, + { + "epoch": 1.8956731825170219, + "grad_norm": 2.2916977405548096, + "learning_rate": 1e-06, + "loss": 0.7839, + "mean_token_accuracy": 0.7544435858726501, + "num_tokens": 437238944.0, + "step": 17262 + }, + { + "epoch": 1.8957830002196354, + "grad_norm": 2.2727274894714355, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.6970477104187012, + "num_tokens": 437263467.0, + "step": 17263 + }, + { + "epoch": 1.895892817922249, 
+ "grad_norm": 2.3458096981048584, + "learning_rate": 1e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7134206891059875, + "num_tokens": 437287777.0, + "step": 17264 + }, + { + "epoch": 1.8960026356248627, + "grad_norm": 2.105609893798828, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7201815247535706, + "num_tokens": 437315504.0, + "step": 17265 + }, + { + "epoch": 1.8961124533274765, + "grad_norm": 2.0242977142333984, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7106798887252808, + "num_tokens": 437344412.0, + "step": 17266 + }, + { + "epoch": 1.89622227103009, + "grad_norm": 2.507777452468872, + "learning_rate": 1e-06, + "loss": 0.8438, + "mean_token_accuracy": 0.7280983328819275, + "num_tokens": 437364553.0, + "step": 17267 + }, + { + "epoch": 1.8963320887327038, + "grad_norm": 2.2119882106781006, + "learning_rate": 1e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7231308817863464, + "num_tokens": 437390096.0, + "step": 17268 + }, + { + "epoch": 1.8964419064353173, + "grad_norm": 2.4124724864959717, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.70637047290802, + "num_tokens": 437414735.0, + "step": 17269 + }, + { + "epoch": 1.896551724137931, + "grad_norm": 2.0346734523773193, + "learning_rate": 1e-06, + "loss": 0.8211, + "mean_token_accuracy": 0.7390261888504028, + "num_tokens": 437443423.0, + "step": 17270 + }, + { + "epoch": 1.8966615418405448, + "grad_norm": 2.402780055999756, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7005035281181335, + "num_tokens": 437467691.0, + "step": 17271 + }, + { + "epoch": 1.8967713595431583, + "grad_norm": 2.2952327728271484, + "learning_rate": 1e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7305077314376831, + "num_tokens": 437491901.0, + "step": 17272 + }, + { + "epoch": 1.8968811772457719, + "grad_norm": 2.2107441425323486, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7302905321121216, + 
"num_tokens": 437517505.0, + "step": 17273 + }, + { + "epoch": 1.8969909949483856, + "grad_norm": 2.2700328826904297, + "learning_rate": 1e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.7289969325065613, + "num_tokens": 437541551.0, + "step": 17274 + }, + { + "epoch": 1.8971008126509994, + "grad_norm": 2.571622371673584, + "learning_rate": 1e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7198583483695984, + "num_tokens": 437563558.0, + "step": 17275 + }, + { + "epoch": 1.8972106303536131, + "grad_norm": 2.4846837520599365, + "learning_rate": 1e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7155956029891968, + "num_tokens": 437586187.0, + "step": 17276 + }, + { + "epoch": 1.8973204480562267, + "grad_norm": 2.658963680267334, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.719672441482544, + "num_tokens": 437607039.0, + "step": 17277 + }, + { + "epoch": 1.8974302657588402, + "grad_norm": 1.9757368564605713, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.6998997926712036, + "num_tokens": 437637490.0, + "step": 17278 + }, + { + "epoch": 1.897540083461454, + "grad_norm": 2.337461471557617, + "learning_rate": 1e-06, + "loss": 0.8293, + "mean_token_accuracy": 0.7382946610450745, + "num_tokens": 437662063.0, + "step": 17279 + }, + { + "epoch": 1.8976499011640677, + "grad_norm": 2.102525472640991, + "learning_rate": 1e-06, + "loss": 0.8055, + "mean_token_accuracy": 0.7448077201843262, + "num_tokens": 437688487.0, + "step": 17280 + }, + { + "epoch": 1.8977597188666813, + "grad_norm": 2.56735897064209, + "learning_rate": 1e-06, + "loss": 0.8403, + "mean_token_accuracy": 0.7341785430908203, + "num_tokens": 437707991.0, + "step": 17281 + }, + { + "epoch": 1.897869536569295, + "grad_norm": 2.258910655975342, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7206369638442993, + "num_tokens": 437734031.0, + "step": 17282 + }, + { + "epoch": 1.8979793542719086, + "grad_norm": 2.327854871749878, + 
"learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7139616012573242, + "num_tokens": 437758182.0, + "step": 17283 + }, + { + "epoch": 1.8980891719745223, + "grad_norm": 2.2751240730285645, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.709706723690033, + "num_tokens": 437786443.0, + "step": 17284 + }, + { + "epoch": 1.898198989677136, + "grad_norm": 2.2805097103118896, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7158148288726807, + "num_tokens": 437811480.0, + "step": 17285 + }, + { + "epoch": 1.8983088073797496, + "grad_norm": 2.1997029781341553, + "learning_rate": 1e-06, + "loss": 0.8786, + "mean_token_accuracy": 0.7274428606033325, + "num_tokens": 437838588.0, + "step": 17286 + }, + { + "epoch": 1.8984186250823631, + "grad_norm": 2.0977020263671875, + "learning_rate": 1e-06, + "loss": 0.8466, + "mean_token_accuracy": 0.732982337474823, + "num_tokens": 437867090.0, + "step": 17287 + }, + { + "epoch": 1.898528442784977, + "grad_norm": 2.303823947906494, + "learning_rate": 1e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7239159345626831, + "num_tokens": 437892958.0, + "step": 17288 + }, + { + "epoch": 1.8986382604875907, + "grad_norm": 2.065751791000366, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7205644845962524, + "num_tokens": 437924220.0, + "step": 17289 + }, + { + "epoch": 1.8987480781902044, + "grad_norm": 2.150632858276367, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7065651416778564, + "num_tokens": 437954559.0, + "step": 17290 + }, + { + "epoch": 1.898857895892818, + "grad_norm": 2.4622042179107666, + "learning_rate": 1e-06, + "loss": 0.8107, + "mean_token_accuracy": 0.7432005405426025, + "num_tokens": 437976348.0, + "step": 17291 + }, + { + "epoch": 1.8989677135954315, + "grad_norm": 2.579437017440796, + "learning_rate": 1e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.736065685749054, + "num_tokens": 437996879.0, + "step": 
17292 + }, + { + "epoch": 1.8990775312980452, + "grad_norm": 2.1847245693206787, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7112930417060852, + "num_tokens": 438024557.0, + "step": 17293 + }, + { + "epoch": 1.899187349000659, + "grad_norm": 2.8014707565307617, + "learning_rate": 1e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.732850193977356, + "num_tokens": 438042867.0, + "step": 17294 + }, + { + "epoch": 1.8992971667032725, + "grad_norm": 2.0602850914001465, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7154396772384644, + "num_tokens": 438074040.0, + "step": 17295 + }, + { + "epoch": 1.899406984405886, + "grad_norm": 2.496724843978882, + "learning_rate": 1e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.7234398126602173, + "num_tokens": 438095401.0, + "step": 17296 + }, + { + "epoch": 1.8995168021084998, + "grad_norm": 2.7039761543273926, + "learning_rate": 1e-06, + "loss": 0.8111, + "mean_token_accuracy": 0.7471760511398315, + "num_tokens": 438114258.0, + "step": 17297 + }, + { + "epoch": 1.8996266198111136, + "grad_norm": 2.5878043174743652, + "learning_rate": 1e-06, + "loss": 0.8279, + "mean_token_accuracy": 0.7361780405044556, + "num_tokens": 438133406.0, + "step": 17298 + }, + { + "epoch": 1.8997364375137273, + "grad_norm": 2.822291851043701, + "learning_rate": 1e-06, + "loss": 0.8705, + "mean_token_accuracy": 0.7290109992027283, + "num_tokens": 438151775.0, + "step": 17299 + }, + { + "epoch": 1.8998462552163409, + "grad_norm": 2.0809199810028076, + "learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7013492584228516, + "num_tokens": 438181852.0, + "step": 17300 + }, + { + "epoch": 1.8999560729189544, + "grad_norm": 2.257157325744629, + "learning_rate": 1e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7229904532432556, + "num_tokens": 438206391.0, + "step": 17301 + }, + { + "epoch": 1.9000658906215682, + "grad_norm": 2.321110248565674, + "learning_rate": 1e-06, + "loss": 0.9104, + 
"mean_token_accuracy": 0.710206151008606, + "num_tokens": 438230607.0, + "step": 17302 + }, + { + "epoch": 1.900175708324182, + "grad_norm": 2.11080002784729, + "learning_rate": 1e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.7229543924331665, + "num_tokens": 438259736.0, + "step": 17303 + }, + { + "epoch": 1.9002855260267957, + "grad_norm": 2.149691343307495, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7154279947280884, + "num_tokens": 438286813.0, + "step": 17304 + }, + { + "epoch": 1.9003953437294092, + "grad_norm": 2.210559606552124, + "learning_rate": 1e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7248198986053467, + "num_tokens": 438312386.0, + "step": 17305 + }, + { + "epoch": 1.9005051614320227, + "grad_norm": 2.345327138900757, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7105625867843628, + "num_tokens": 438336267.0, + "step": 17306 + }, + { + "epoch": 1.9006149791346365, + "grad_norm": 2.2024471759796143, + "learning_rate": 1e-06, + "loss": 0.8021, + "mean_token_accuracy": 0.7482571601867676, + "num_tokens": 438359557.0, + "step": 17307 + }, + { + "epoch": 1.9007247968372503, + "grad_norm": 2.1603105068206787, + "learning_rate": 1e-06, + "loss": 0.7979, + "mean_token_accuracy": 0.7474178075790405, + "num_tokens": 438384815.0, + "step": 17308 + }, + { + "epoch": 1.9008346145398638, + "grad_norm": 2.203305244445801, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7076829075813293, + "num_tokens": 438412054.0, + "step": 17309 + }, + { + "epoch": 1.9009444322424773, + "grad_norm": 2.2255196571350098, + "learning_rate": 1e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.733970046043396, + "num_tokens": 438437367.0, + "step": 17310 + }, + { + "epoch": 1.901054249945091, + "grad_norm": 2.1323602199554443, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7073708772659302, + "num_tokens": 438465971.0, + "step": 17311 + }, + { + "epoch": 1.9011640676477048, 
+ "grad_norm": 2.2109806537628174, + "learning_rate": 1e-06, + "loss": 0.8407, + "mean_token_accuracy": 0.7390369772911072, + "num_tokens": 438491213.0, + "step": 17312 + }, + { + "epoch": 1.9012738853503186, + "grad_norm": 2.3562958240509033, + "learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7183452248573303, + "num_tokens": 438514033.0, + "step": 17313 + }, + { + "epoch": 1.9013837030529321, + "grad_norm": 2.173264980316162, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7158190011978149, + "num_tokens": 438542555.0, + "step": 17314 + }, + { + "epoch": 1.9014935207555457, + "grad_norm": 2.4203848838806152, + "learning_rate": 1e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.7323383092880249, + "num_tokens": 438564577.0, + "step": 17315 + }, + { + "epoch": 1.9016033384581594, + "grad_norm": 2.6701583862304688, + "learning_rate": 1e-06, + "loss": 0.7695, + "mean_token_accuracy": 0.7614425420761108, + "num_tokens": 438583128.0, + "step": 17316 + }, + { + "epoch": 1.9017131561607732, + "grad_norm": 2.127612590789795, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7184532284736633, + "num_tokens": 438611304.0, + "step": 17317 + }, + { + "epoch": 1.9018229738633867, + "grad_norm": 2.325007438659668, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.715239942073822, + "num_tokens": 438634610.0, + "step": 17318 + }, + { + "epoch": 1.9019327915660005, + "grad_norm": 2.446394920349121, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7295379042625427, + "num_tokens": 438656170.0, + "step": 17319 + }, + { + "epoch": 1.902042609268614, + "grad_norm": 2.053104877471924, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7227283716201782, + "num_tokens": 438684831.0, + "step": 17320 + }, + { + "epoch": 1.9021524269712278, + "grad_norm": 2.374128818511963, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.7279154062271118, + 
"num_tokens": 438707733.0, + "step": 17321 + }, + { + "epoch": 1.9022622446738415, + "grad_norm": 2.4040403366088867, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7200907468795776, + "num_tokens": 438730859.0, + "step": 17322 + }, + { + "epoch": 1.902372062376455, + "grad_norm": 2.043036460876465, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7004699110984802, + "num_tokens": 438761872.0, + "step": 17323 + }, + { + "epoch": 1.9024818800790686, + "grad_norm": 2.289407730102539, + "learning_rate": 1e-06, + "loss": 0.8393, + "mean_token_accuracy": 0.734329104423523, + "num_tokens": 438786678.0, + "step": 17324 + }, + { + "epoch": 1.9025916977816824, + "grad_norm": 2.213237762451172, + "learning_rate": 1e-06, + "loss": 0.8511, + "mean_token_accuracy": 0.7412481307983398, + "num_tokens": 438810377.0, + "step": 17325 + }, + { + "epoch": 1.9027015154842961, + "grad_norm": 2.4058403968811035, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7191749215126038, + "num_tokens": 438832873.0, + "step": 17326 + }, + { + "epoch": 1.9028113331869099, + "grad_norm": 2.081325054168701, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.716603696346283, + "num_tokens": 438862224.0, + "step": 17327 + }, + { + "epoch": 1.9029211508895234, + "grad_norm": 2.1661598682403564, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7195276618003845, + "num_tokens": 438888983.0, + "step": 17328 + }, + { + "epoch": 1.903030968592137, + "grad_norm": 2.693641185760498, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7089945077896118, + "num_tokens": 438909864.0, + "step": 17329 + }, + { + "epoch": 1.9031407862947507, + "grad_norm": 2.0735020637512207, + "learning_rate": 1e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7333658933639526, + "num_tokens": 438935756.0, + "step": 17330 + }, + { + "epoch": 1.9032506039973645, + "grad_norm": 2.0121965408325195, + 
"learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7118739485740662, + "num_tokens": 438968305.0, + "step": 17331 + }, + { + "epoch": 1.903360421699978, + "grad_norm": 2.255627155303955, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.7180019021034241, + "num_tokens": 438993335.0, + "step": 17332 + }, + { + "epoch": 1.9034702394025917, + "grad_norm": 2.176823854446411, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.6985547542572021, + "num_tokens": 439020835.0, + "step": 17333 + }, + { + "epoch": 1.9035800571052053, + "grad_norm": 2.1348915100097656, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7261895537376404, + "num_tokens": 439049872.0, + "step": 17334 + }, + { + "epoch": 1.903689874807819, + "grad_norm": 2.21065616607666, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.719556987285614, + "num_tokens": 439074590.0, + "step": 17335 + }, + { + "epoch": 1.9037996925104328, + "grad_norm": 2.2051944732666016, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7038714289665222, + "num_tokens": 439099767.0, + "step": 17336 + }, + { + "epoch": 1.9039095102130463, + "grad_norm": 2.241150379180908, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7221508622169495, + "num_tokens": 439125779.0, + "step": 17337 + }, + { + "epoch": 1.9040193279156599, + "grad_norm": 2.4391117095947266, + "learning_rate": 1e-06, + "loss": 0.8209, + "mean_token_accuracy": 0.7430778741836548, + "num_tokens": 439147739.0, + "step": 17338 + }, + { + "epoch": 1.9041291456182736, + "grad_norm": 1.920846939086914, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7080999612808228, + "num_tokens": 439183650.0, + "step": 17339 + }, + { + "epoch": 1.9042389633208874, + "grad_norm": 2.3655266761779785, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7243660688400269, + "num_tokens": 439206997.0, + "step": 
17340 + }, + { + "epoch": 1.9043487810235011, + "grad_norm": 2.11191725730896, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7110382318496704, + "num_tokens": 439234785.0, + "step": 17341 + }, + { + "epoch": 1.9044585987261147, + "grad_norm": 1.8693894147872925, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7365953326225281, + "num_tokens": 439270371.0, + "step": 17342 + }, + { + "epoch": 1.9045684164287282, + "grad_norm": 2.247957706451416, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7259194254875183, + "num_tokens": 439294819.0, + "step": 17343 + }, + { + "epoch": 1.904678234131342, + "grad_norm": 2.070718765258789, + "learning_rate": 1e-06, + "loss": 0.8513, + "mean_token_accuracy": 0.7302625179290771, + "num_tokens": 439320996.0, + "step": 17344 + }, + { + "epoch": 1.9047880518339557, + "grad_norm": 2.1946163177490234, + "learning_rate": 1e-06, + "loss": 0.8451, + "mean_token_accuracy": 0.732524573802948, + "num_tokens": 439346540.0, + "step": 17345 + }, + { + "epoch": 1.9048978695365693, + "grad_norm": 2.1488020420074463, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7191421985626221, + "num_tokens": 439373748.0, + "step": 17346 + }, + { + "epoch": 1.9050076872391828, + "grad_norm": 2.672093391418457, + "learning_rate": 1e-06, + "loss": 0.8014, + "mean_token_accuracy": 0.7404695749282837, + "num_tokens": 439392859.0, + "step": 17347 + }, + { + "epoch": 1.9051175049417965, + "grad_norm": 2.6058735847473145, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7172953486442566, + "num_tokens": 439413355.0, + "step": 17348 + }, + { + "epoch": 1.9052273226444103, + "grad_norm": 2.2638907432556152, + "learning_rate": 1e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7178249359130859, + "num_tokens": 439438268.0, + "step": 17349 + }, + { + "epoch": 1.905337140347024, + "grad_norm": 2.199361801147461, + "learning_rate": 1e-06, + "loss": 0.8941, + 
"mean_token_accuracy": 0.7194834351539612, + "num_tokens": 439463381.0, + "step": 17350 + }, + { + "epoch": 1.9054469580496376, + "grad_norm": 2.4968152046203613, + "learning_rate": 1e-06, + "loss": 0.8472, + "mean_token_accuracy": 0.7300701141357422, + "num_tokens": 439483921.0, + "step": 17351 + }, + { + "epoch": 1.9055567757522511, + "grad_norm": 2.0517754554748535, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7262386083602905, + "num_tokens": 439514560.0, + "step": 17352 + }, + { + "epoch": 1.9056665934548649, + "grad_norm": 2.2863643169403076, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7215120792388916, + "num_tokens": 439540601.0, + "step": 17353 + }, + { + "epoch": 1.9057764111574786, + "grad_norm": 2.3651516437530518, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7303792238235474, + "num_tokens": 439563828.0, + "step": 17354 + }, + { + "epoch": 1.9058862288600924, + "grad_norm": 1.9898886680603027, + "learning_rate": 1e-06, + "loss": 1.0207, + "mean_token_accuracy": 0.6852993965148926, + "num_tokens": 439595907.0, + "step": 17355 + }, + { + "epoch": 1.905996046562706, + "grad_norm": 2.5539491176605225, + "learning_rate": 1e-06, + "loss": 0.7815, + "mean_token_accuracy": 0.7480874061584473, + "num_tokens": 439616312.0, + "step": 17356 + }, + { + "epoch": 1.9061058642653195, + "grad_norm": 2.3999366760253906, + "learning_rate": 1e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.7396273612976074, + "num_tokens": 439637323.0, + "step": 17357 + }, + { + "epoch": 1.9062156819679332, + "grad_norm": 2.259901523590088, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.7008039951324463, + "num_tokens": 439664692.0, + "step": 17358 + }, + { + "epoch": 1.906325499670547, + "grad_norm": 2.225646495819092, + "learning_rate": 1e-06, + "loss": 0.8249, + "mean_token_accuracy": 0.7424255609512329, + "num_tokens": 439688784.0, + "step": 17359 + }, + { + "epoch": 
1.9064353173731605, + "grad_norm": 2.162879705429077, + "learning_rate": 1e-06, + "loss": 0.8999, + "mean_token_accuracy": 0.7273683547973633, + "num_tokens": 439715441.0, + "step": 17360 + }, + { + "epoch": 1.906545135075774, + "grad_norm": 2.255958080291748, + "learning_rate": 1e-06, + "loss": 0.8118, + "mean_token_accuracy": 0.7407445311546326, + "num_tokens": 439741311.0, + "step": 17361 + }, + { + "epoch": 1.9066549527783878, + "grad_norm": 2.3622965812683105, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7202141284942627, + "num_tokens": 439766989.0, + "step": 17362 + }, + { + "epoch": 1.9067647704810016, + "grad_norm": 2.5226566791534424, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7255408763885498, + "num_tokens": 439788883.0, + "step": 17363 + }, + { + "epoch": 1.9068745881836153, + "grad_norm": 2.388847827911377, + "learning_rate": 1e-06, + "loss": 0.834, + "mean_token_accuracy": 0.7334822416305542, + "num_tokens": 439810806.0, + "step": 17364 + }, + { + "epoch": 1.9069844058862289, + "grad_norm": 2.2903215885162354, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7141211032867432, + "num_tokens": 439836572.0, + "step": 17365 + }, + { + "epoch": 1.9070942235888424, + "grad_norm": 2.166785955429077, + "learning_rate": 1e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7500380873680115, + "num_tokens": 439860903.0, + "step": 17366 + }, + { + "epoch": 1.9072040412914562, + "grad_norm": 2.139453649520874, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7021199464797974, + "num_tokens": 439890361.0, + "step": 17367 + }, + { + "epoch": 1.90731385899407, + "grad_norm": 2.1874020099639893, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7218100428581238, + "num_tokens": 439917697.0, + "step": 17368 + }, + { + "epoch": 1.9074236766966837, + "grad_norm": 2.210291624069214, + "learning_rate": 1e-06, + "loss": 0.8093, + "mean_token_accuracy": 
0.7397506237030029, + "num_tokens": 439941928.0, + "step": 17369 + }, + { + "epoch": 1.9075334943992972, + "grad_norm": 2.1400883197784424, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7054421901702881, + "num_tokens": 439969793.0, + "step": 17370 + }, + { + "epoch": 1.9076433121019107, + "grad_norm": 1.8788069486618042, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7216553688049316, + "num_tokens": 440005283.0, + "step": 17371 + }, + { + "epoch": 1.9077531298045245, + "grad_norm": 2.0918846130371094, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6866499185562134, + "num_tokens": 440035855.0, + "step": 17372 + }, + { + "epoch": 1.9078629475071383, + "grad_norm": 2.137561321258545, + "learning_rate": 1e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.701250433921814, + "num_tokens": 440062992.0, + "step": 17373 + }, + { + "epoch": 1.9079727652097518, + "grad_norm": 2.2257931232452393, + "learning_rate": 1e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7329316139221191, + "num_tokens": 440087021.0, + "step": 17374 + }, + { + "epoch": 1.9080825829123653, + "grad_norm": 2.397390127182007, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7185633182525635, + "num_tokens": 440109995.0, + "step": 17375 + }, + { + "epoch": 1.908192400614979, + "grad_norm": 2.162994861602783, + "learning_rate": 1e-06, + "loss": 0.8437, + "mean_token_accuracy": 0.7307230234146118, + "num_tokens": 440135189.0, + "step": 17376 + }, + { + "epoch": 1.9083022183175928, + "grad_norm": 2.23292875289917, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7160672545433044, + "num_tokens": 440161555.0, + "step": 17377 + }, + { + "epoch": 1.9084120360202066, + "grad_norm": 2.6176156997680664, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7316029667854309, + "num_tokens": 440182629.0, + "step": 17378 + }, + { + "epoch": 1.9085218537228201, + "grad_norm": 
2.653191566467285, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7244570255279541, + "num_tokens": 440201917.0, + "step": 17379 + }, + { + "epoch": 1.9086316714254337, + "grad_norm": 2.1938021183013916, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7158543467521667, + "num_tokens": 440228549.0, + "step": 17380 + }, + { + "epoch": 1.9087414891280474, + "grad_norm": 2.446932554244995, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7225093841552734, + "num_tokens": 440250655.0, + "step": 17381 + }, + { + "epoch": 1.9088513068306612, + "grad_norm": 2.574103832244873, + "learning_rate": 1e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.7204961776733398, + "num_tokens": 440272528.0, + "step": 17382 + }, + { + "epoch": 1.9089611245332747, + "grad_norm": 2.2273929119110107, + "learning_rate": 1e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7340189218521118, + "num_tokens": 440297085.0, + "step": 17383 + }, + { + "epoch": 1.9090709422358885, + "grad_norm": 2.619300365447998, + "learning_rate": 1e-06, + "loss": 0.7977, + "mean_token_accuracy": 0.7434194087982178, + "num_tokens": 440314899.0, + "step": 17384 + }, + { + "epoch": 1.909180759938502, + "grad_norm": 2.2502694129943848, + "learning_rate": 1e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.717445969581604, + "num_tokens": 440338509.0, + "step": 17385 + }, + { + "epoch": 1.9092905776411158, + "grad_norm": 2.0988242626190186, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7019952535629272, + "num_tokens": 440368907.0, + "step": 17386 + }, + { + "epoch": 1.9094003953437295, + "grad_norm": 2.299546718597412, + "learning_rate": 1e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7235506772994995, + "num_tokens": 440393975.0, + "step": 17387 + }, + { + "epoch": 1.909510213046343, + "grad_norm": 2.388557195663452, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7156372666358948, + "num_tokens": 
440417937.0, + "step": 17388 + }, + { + "epoch": 1.9096200307489566, + "grad_norm": 2.2011308670043945, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7156331539154053, + "num_tokens": 440444397.0, + "step": 17389 + }, + { + "epoch": 1.9097298484515703, + "grad_norm": 2.5107343196868896, + "learning_rate": 1e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7376613020896912, + "num_tokens": 440465880.0, + "step": 17390 + }, + { + "epoch": 1.909839666154184, + "grad_norm": 2.422736406326294, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7194924354553223, + "num_tokens": 440489481.0, + "step": 17391 + }, + { + "epoch": 1.9099494838567979, + "grad_norm": 2.11552357673645, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7133320569992065, + "num_tokens": 440517224.0, + "step": 17392 + }, + { + "epoch": 1.9100593015594114, + "grad_norm": 2.637669801712036, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7157489061355591, + "num_tokens": 440536303.0, + "step": 17393 + }, + { + "epoch": 1.910169119262025, + "grad_norm": 2.251000165939331, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7128388285636902, + "num_tokens": 440563105.0, + "step": 17394 + }, + { + "epoch": 1.9102789369646387, + "grad_norm": 2.2096781730651855, + "learning_rate": 1e-06, + "loss": 0.8526, + "mean_token_accuracy": 0.7311669588088989, + "num_tokens": 440588077.0, + "step": 17395 + }, + { + "epoch": 1.9103887546672524, + "grad_norm": 2.0487825870513916, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.71478271484375, + "num_tokens": 440618068.0, + "step": 17396 + }, + { + "epoch": 1.910498572369866, + "grad_norm": 2.381223440170288, + "learning_rate": 1e-06, + "loss": 0.7602, + "mean_token_accuracy": 0.7559343576431274, + "num_tokens": 440639647.0, + "step": 17397 + }, + { + "epoch": 1.9106083900724797, + "grad_norm": 2.061481237411499, + "learning_rate": 1e-06, 
+ "loss": 0.9365, + "mean_token_accuracy": 0.7113667726516724, + "num_tokens": 440670347.0, + "step": 17398 + }, + { + "epoch": 1.9107182077750933, + "grad_norm": 2.344564437866211, + "learning_rate": 1e-06, + "loss": 0.8269, + "mean_token_accuracy": 0.7385291457176208, + "num_tokens": 440694765.0, + "step": 17399 + }, + { + "epoch": 1.910828025477707, + "grad_norm": 2.2137691974639893, + "learning_rate": 1e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7336323857307434, + "num_tokens": 440718828.0, + "step": 17400 + }, + { + "epoch": 1.9109378431803208, + "grad_norm": 2.1850168704986572, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.722915768623352, + "num_tokens": 440745235.0, + "step": 17401 + }, + { + "epoch": 1.9110476608829343, + "grad_norm": 2.218672037124634, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7240806818008423, + "num_tokens": 440770084.0, + "step": 17402 + }, + { + "epoch": 1.9111574785855479, + "grad_norm": 2.7073161602020264, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7246967554092407, + "num_tokens": 440788679.0, + "step": 17403 + }, + { + "epoch": 1.9112672962881616, + "grad_norm": 2.151495933532715, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.7199628949165344, + "num_tokens": 440816964.0, + "step": 17404 + }, + { + "epoch": 1.9113771139907754, + "grad_norm": 2.520280122756958, + "learning_rate": 1e-06, + "loss": 0.8319, + "mean_token_accuracy": 0.7318736910820007, + "num_tokens": 440837611.0, + "step": 17405 + }, + { + "epoch": 1.9114869316933891, + "grad_norm": 2.4933576583862305, + "learning_rate": 1e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7341697216033936, + "num_tokens": 440860448.0, + "step": 17406 + }, + { + "epoch": 1.9115967493960027, + "grad_norm": 2.453927993774414, + "learning_rate": 1e-06, + "loss": 0.8342, + "mean_token_accuracy": 0.7357487678527832, + "num_tokens": 440880254.0, + "step": 17407 + }, + { + 
"epoch": 1.9117065670986162, + "grad_norm": 2.3691506385803223, + "learning_rate": 1e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.7264896631240845, + "num_tokens": 440903048.0, + "step": 17408 + }, + { + "epoch": 1.91181638480123, + "grad_norm": 2.2071893215179443, + "learning_rate": 1e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.7208206057548523, + "num_tokens": 440931696.0, + "step": 17409 + }, + { + "epoch": 1.9119262025038437, + "grad_norm": 2.3119020462036133, + "learning_rate": 1e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.7262120246887207, + "num_tokens": 440955676.0, + "step": 17410 + }, + { + "epoch": 1.9120360202064572, + "grad_norm": 2.371173620223999, + "learning_rate": 1e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7092550992965698, + "num_tokens": 440978048.0, + "step": 17411 + }, + { + "epoch": 1.9121458379090708, + "grad_norm": 2.2220311164855957, + "learning_rate": 1e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.733740508556366, + "num_tokens": 441004024.0, + "step": 17412 + }, + { + "epoch": 1.9122556556116845, + "grad_norm": 2.092283010482788, + "learning_rate": 1e-06, + "loss": 0.8228, + "mean_token_accuracy": 0.7464573383331299, + "num_tokens": 441030331.0, + "step": 17413 + }, + { + "epoch": 1.9123654733142983, + "grad_norm": 2.3601999282836914, + "learning_rate": 1e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7333332300186157, + "num_tokens": 441054082.0, + "step": 17414 + }, + { + "epoch": 1.912475291016912, + "grad_norm": 2.241832971572876, + "learning_rate": 1e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.6996103525161743, + "num_tokens": 441080611.0, + "step": 17415 + }, + { + "epoch": 1.9125851087195256, + "grad_norm": 2.3563218116760254, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7225295901298523, + "num_tokens": 441104830.0, + "step": 17416 + }, + { + "epoch": 1.9126949264221391, + "grad_norm": 2.1843388080596924, + "learning_rate": 1e-06, + "loss": 0.8129, + 
"mean_token_accuracy": 0.7394260168075562, + "num_tokens": 441131213.0, + "step": 17417 + }, + { + "epoch": 1.9128047441247529, + "grad_norm": 2.3312084674835205, + "learning_rate": 1e-06, + "loss": 0.8453, + "mean_token_accuracy": 0.7348951101303101, + "num_tokens": 441155116.0, + "step": 17418 + }, + { + "epoch": 1.9129145618273666, + "grad_norm": 2.156684160232544, + "learning_rate": 1e-06, + "loss": 0.8356, + "mean_token_accuracy": 0.7427847385406494, + "num_tokens": 441180892.0, + "step": 17419 + }, + { + "epoch": 1.9130243795299804, + "grad_norm": 2.6096291542053223, + "learning_rate": 1e-06, + "loss": 0.8432, + "mean_token_accuracy": 0.7315772771835327, + "num_tokens": 441201458.0, + "step": 17420 + }, + { + "epoch": 1.913134197232594, + "grad_norm": 2.4372191429138184, + "learning_rate": 1e-06, + "loss": 0.8297, + "mean_token_accuracy": 0.740210771560669, + "num_tokens": 441223540.0, + "step": 17421 + }, + { + "epoch": 1.9132440149352075, + "grad_norm": 2.273819923400879, + "learning_rate": 1e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.7165409326553345, + "num_tokens": 441250791.0, + "step": 17422 + }, + { + "epoch": 1.9133538326378212, + "grad_norm": 2.4781785011291504, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7129151821136475, + "num_tokens": 441275882.0, + "step": 17423 + }, + { + "epoch": 1.913463650340435, + "grad_norm": 2.4409773349761963, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7216165065765381, + "num_tokens": 441300072.0, + "step": 17424 + }, + { + "epoch": 1.9135734680430485, + "grad_norm": 2.1869919300079346, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.722395658493042, + "num_tokens": 441326427.0, + "step": 17425 + }, + { + "epoch": 1.913683285745662, + "grad_norm": 2.3151698112487793, + "learning_rate": 1e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.730629563331604, + "num_tokens": 441349952.0, + "step": 17426 + }, + { + "epoch": 1.9137931034482758, 
+ "grad_norm": 2.3822500705718994, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7192767858505249, + "num_tokens": 441373499.0, + "step": 17427 + }, + { + "epoch": 1.9139029211508896, + "grad_norm": 1.850821852684021, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7149069309234619, + "num_tokens": 441408634.0, + "step": 17428 + }, + { + "epoch": 1.9140127388535033, + "grad_norm": 2.5079989433288574, + "learning_rate": 1e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.7225646376609802, + "num_tokens": 441429826.0, + "step": 17429 + }, + { + "epoch": 1.9141225565561168, + "grad_norm": 2.2145884037017822, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.709434986114502, + "num_tokens": 441456962.0, + "step": 17430 + }, + { + "epoch": 1.9142323742587304, + "grad_norm": 2.498793125152588, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7170294523239136, + "num_tokens": 441478455.0, + "step": 17431 + }, + { + "epoch": 1.9143421919613441, + "grad_norm": 2.1606345176696777, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7163168787956238, + "num_tokens": 441504977.0, + "step": 17432 + }, + { + "epoch": 1.914452009663958, + "grad_norm": 2.2145493030548096, + "learning_rate": 1e-06, + "loss": 0.8168, + "mean_token_accuracy": 0.7477709054946899, + "num_tokens": 441530455.0, + "step": 17433 + }, + { + "epoch": 1.9145618273665717, + "grad_norm": 2.405142307281494, + "learning_rate": 1e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7352731227874756, + "num_tokens": 441552294.0, + "step": 17434 + }, + { + "epoch": 1.9146716450691852, + "grad_norm": 2.4411754608154297, + "learning_rate": 1e-06, + "loss": 0.832, + "mean_token_accuracy": 0.7339721918106079, + "num_tokens": 441573442.0, + "step": 17435 + }, + { + "epoch": 1.9147814627717987, + "grad_norm": 2.310809373855591, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7008147835731506, + 
"num_tokens": 441598513.0, + "step": 17436 + }, + { + "epoch": 1.9148912804744125, + "grad_norm": 2.3747782707214355, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.6978244781494141, + "num_tokens": 441624418.0, + "step": 17437 + }, + { + "epoch": 1.9150010981770262, + "grad_norm": 2.2969563007354736, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7050188779830933, + "num_tokens": 441650682.0, + "step": 17438 + }, + { + "epoch": 1.9151109158796398, + "grad_norm": 2.303790330886841, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7159898281097412, + "num_tokens": 441677006.0, + "step": 17439 + }, + { + "epoch": 1.9152207335822533, + "grad_norm": 2.4733872413635254, + "learning_rate": 1e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.7317849397659302, + "num_tokens": 441699325.0, + "step": 17440 + }, + { + "epoch": 1.915330551284867, + "grad_norm": 2.169236660003662, + "learning_rate": 1e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.74080491065979, + "num_tokens": 441725432.0, + "step": 17441 + }, + { + "epoch": 1.9154403689874808, + "grad_norm": 2.2658181190490723, + "learning_rate": 1e-06, + "loss": 0.7966, + "mean_token_accuracy": 0.7523072957992554, + "num_tokens": 441749003.0, + "step": 17442 + }, + { + "epoch": 1.9155501866900946, + "grad_norm": 2.0035924911499023, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.70906001329422, + "num_tokens": 441781324.0, + "step": 17443 + }, + { + "epoch": 1.9156600043927081, + "grad_norm": 2.3797767162323, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7254701852798462, + "num_tokens": 441805055.0, + "step": 17444 + }, + { + "epoch": 1.9157698220953217, + "grad_norm": 2.214787244796753, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7156758308410645, + "num_tokens": 441831182.0, + "step": 17445 + }, + { + "epoch": 1.9158796397979354, + "grad_norm": 2.4239189624786377, + 
"learning_rate": 1e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.73610520362854, + "num_tokens": 441852782.0, + "step": 17446 + }, + { + "epoch": 1.9159894575005492, + "grad_norm": 2.2022016048431396, + "learning_rate": 1e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.7079242467880249, + "num_tokens": 441880295.0, + "step": 17447 + }, + { + "epoch": 1.9160992752031627, + "grad_norm": 2.071282386779785, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7027959227561951, + "num_tokens": 441909948.0, + "step": 17448 + }, + { + "epoch": 1.9162090929057765, + "grad_norm": 2.0734524726867676, + "learning_rate": 1e-06, + "loss": 0.8647, + "mean_token_accuracy": 0.7297146320343018, + "num_tokens": 441939221.0, + "step": 17449 + }, + { + "epoch": 1.91631891060839, + "grad_norm": 2.073963165283203, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7123193740844727, + "num_tokens": 441968512.0, + "step": 17450 + }, + { + "epoch": 1.9164287283110037, + "grad_norm": 2.1418585777282715, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7239525318145752, + "num_tokens": 441997081.0, + "step": 17451 + }, + { + "epoch": 1.9165385460136175, + "grad_norm": 2.3123934268951416, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.6986520290374756, + "num_tokens": 442023132.0, + "step": 17452 + }, + { + "epoch": 1.916648363716231, + "grad_norm": 2.2147438526153564, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7162474393844604, + "num_tokens": 442047772.0, + "step": 17453 + }, + { + "epoch": 1.9167581814188446, + "grad_norm": 2.2810025215148926, + "learning_rate": 1e-06, + "loss": 0.8778, + "mean_token_accuracy": 0.7295642495155334, + "num_tokens": 442072424.0, + "step": 17454 + }, + { + "epoch": 1.9168679991214583, + "grad_norm": 2.239865303039551, + "learning_rate": 1e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.7248320579528809, + "num_tokens": 442098678.0, + "step": 
17455 + }, + { + "epoch": 1.916977816824072, + "grad_norm": 2.1795003414154053, + "learning_rate": 1e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7152241468429565, + "num_tokens": 442124706.0, + "step": 17456 + }, + { + "epoch": 1.9170876345266858, + "grad_norm": 2.384422779083252, + "learning_rate": 1e-06, + "loss": 0.8234, + "mean_token_accuracy": 0.7379254102706909, + "num_tokens": 442146680.0, + "step": 17457 + }, + { + "epoch": 1.9171974522292994, + "grad_norm": 2.4378702640533447, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7029780149459839, + "num_tokens": 442172042.0, + "step": 17458 + }, + { + "epoch": 1.917307269931913, + "grad_norm": 2.148383140563965, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7153410911560059, + "num_tokens": 442199717.0, + "step": 17459 + }, + { + "epoch": 1.9174170876345267, + "grad_norm": 1.9638419151306152, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7124337553977966, + "num_tokens": 442231663.0, + "step": 17460 + }, + { + "epoch": 1.9175269053371404, + "grad_norm": 2.5672898292541504, + "learning_rate": 1e-06, + "loss": 0.8366, + "mean_token_accuracy": 0.7477507591247559, + "num_tokens": 442252951.0, + "step": 17461 + }, + { + "epoch": 1.917636723039754, + "grad_norm": 2.1519196033477783, + "learning_rate": 1e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.722854495048523, + "num_tokens": 442280336.0, + "step": 17462 + }, + { + "epoch": 1.9177465407423677, + "grad_norm": 2.2144081592559814, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7310915589332581, + "num_tokens": 442305161.0, + "step": 17463 + }, + { + "epoch": 1.9178563584449813, + "grad_norm": 2.4102351665496826, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7092059850692749, + "num_tokens": 442327956.0, + "step": 17464 + }, + { + "epoch": 1.917966176147595, + "grad_norm": 2.124737501144409, + "learning_rate": 1e-06, + "loss": 0.8603, + 
"mean_token_accuracy": 0.7414052486419678, + "num_tokens": 442353311.0, + "step": 17465 + }, + { + "epoch": 1.9180759938502088, + "grad_norm": 2.1069531440734863, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.6984328627586365, + "num_tokens": 442383397.0, + "step": 17466 + }, + { + "epoch": 1.9181858115528223, + "grad_norm": 2.1499571800231934, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7260330319404602, + "num_tokens": 442410429.0, + "step": 17467 + }, + { + "epoch": 1.9182956292554358, + "grad_norm": 2.3252663612365723, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7105834484100342, + "num_tokens": 442436040.0, + "step": 17468 + }, + { + "epoch": 1.9184054469580496, + "grad_norm": 2.3677003383636475, + "learning_rate": 1e-06, + "loss": 0.8704, + "mean_token_accuracy": 0.7272216081619263, + "num_tokens": 442458982.0, + "step": 17469 + }, + { + "epoch": 1.9185152646606634, + "grad_norm": 2.259871006011963, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7135284543037415, + "num_tokens": 442483080.0, + "step": 17470 + }, + { + "epoch": 1.9186250823632771, + "grad_norm": 2.209425687789917, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.6947561502456665, + "num_tokens": 442509590.0, + "step": 17471 + }, + { + "epoch": 1.9187349000658906, + "grad_norm": 2.5597574710845947, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7091608047485352, + "num_tokens": 442531598.0, + "step": 17472 + }, + { + "epoch": 1.9188447177685042, + "grad_norm": 2.2684082984924316, + "learning_rate": 1e-06, + "loss": 0.7442, + "mean_token_accuracy": 0.7617185115814209, + "num_tokens": 442553807.0, + "step": 17473 + }, + { + "epoch": 1.918954535471118, + "grad_norm": 2.192570447921753, + "learning_rate": 1e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.718650758266449, + "num_tokens": 442580498.0, + "step": 17474 + }, + { + "epoch": 
1.9190643531737317, + "grad_norm": 2.0654826164245605, + "learning_rate": 1e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7346625328063965, + "num_tokens": 442608110.0, + "step": 17475 + }, + { + "epoch": 1.9191741708763452, + "grad_norm": 1.8933796882629395, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7154351472854614, + "num_tokens": 442640465.0, + "step": 17476 + }, + { + "epoch": 1.9192839885789588, + "grad_norm": 2.386589765548706, + "learning_rate": 1e-06, + "loss": 0.8283, + "mean_token_accuracy": 0.737708568572998, + "num_tokens": 442662125.0, + "step": 17477 + }, + { + "epoch": 1.9193938062815725, + "grad_norm": 2.009610891342163, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.6995529532432556, + "num_tokens": 442692809.0, + "step": 17478 + }, + { + "epoch": 1.9195036239841863, + "grad_norm": 1.923972249031067, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7220032215118408, + "num_tokens": 442723426.0, + "step": 17479 + }, + { + "epoch": 1.9196134416868, + "grad_norm": 2.52207612991333, + "learning_rate": 1e-06, + "loss": 0.8128, + "mean_token_accuracy": 0.7377134561538696, + "num_tokens": 442742579.0, + "step": 17480 + }, + { + "epoch": 1.9197232593894136, + "grad_norm": 2.6756231784820557, + "learning_rate": 1e-06, + "loss": 0.7788, + "mean_token_accuracy": 0.7435370683670044, + "num_tokens": 442761369.0, + "step": 17481 + }, + { + "epoch": 1.919833077092027, + "grad_norm": 2.0790789127349854, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.712281346321106, + "num_tokens": 442790654.0, + "step": 17482 + }, + { + "epoch": 1.9199428947946409, + "grad_norm": 2.336853265762329, + "learning_rate": 1e-06, + "loss": 0.8201, + "mean_token_accuracy": 0.7422643899917603, + "num_tokens": 442812374.0, + "step": 17483 + }, + { + "epoch": 1.9200527124972546, + "grad_norm": 2.1275253295898438, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 
0.7065221667289734, + "num_tokens": 442841101.0, + "step": 17484 + }, + { + "epoch": 1.9201625301998684, + "grad_norm": 2.1201364994049072, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7062740325927734, + "num_tokens": 442868749.0, + "step": 17485 + }, + { + "epoch": 1.920272347902482, + "grad_norm": 2.1801939010620117, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7056652307510376, + "num_tokens": 442897514.0, + "step": 17486 + }, + { + "epoch": 1.9203821656050954, + "grad_norm": 2.483161211013794, + "learning_rate": 1e-06, + "loss": 0.7551, + "mean_token_accuracy": 0.756041944026947, + "num_tokens": 442916090.0, + "step": 17487 + }, + { + "epoch": 1.9204919833077092, + "grad_norm": 2.305506467819214, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7070413827896118, + "num_tokens": 442939747.0, + "step": 17488 + }, + { + "epoch": 1.920601801010323, + "grad_norm": 2.017421007156372, + "learning_rate": 1e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7233867645263672, + "num_tokens": 442972235.0, + "step": 17489 + }, + { + "epoch": 1.9207116187129365, + "grad_norm": 2.536054849624634, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7148219347000122, + "num_tokens": 442996507.0, + "step": 17490 + }, + { + "epoch": 1.92082143641555, + "grad_norm": 2.412740707397461, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7077635526657104, + "num_tokens": 443021552.0, + "step": 17491 + }, + { + "epoch": 1.9209312541181638, + "grad_norm": 2.528824806213379, + "learning_rate": 1e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7244601249694824, + "num_tokens": 443041605.0, + "step": 17492 + }, + { + "epoch": 1.9210410718207775, + "grad_norm": 2.1468029022216797, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.71669602394104, + "num_tokens": 443071560.0, + "step": 17493 + }, + { + "epoch": 1.9211508895233913, + "grad_norm": 
2.612204074859619, + "learning_rate": 1e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.7476256489753723, + "num_tokens": 443092263.0, + "step": 17494 + }, + { + "epoch": 1.9212607072260048, + "grad_norm": 2.2152202129364014, + "learning_rate": 1e-06, + "loss": 0.8732, + "mean_token_accuracy": 0.729291558265686, + "num_tokens": 443117561.0, + "step": 17495 + }, + { + "epoch": 1.9213705249286184, + "grad_norm": 2.116114854812622, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7222031950950623, + "num_tokens": 443149111.0, + "step": 17496 + }, + { + "epoch": 1.9214803426312321, + "grad_norm": 2.07555890083313, + "learning_rate": 1e-06, + "loss": 0.7834, + "mean_token_accuracy": 0.7520201802253723, + "num_tokens": 443177451.0, + "step": 17497 + }, + { + "epoch": 1.9215901603338459, + "grad_norm": 2.053239107131958, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7211195230484009, + "num_tokens": 443206694.0, + "step": 17498 + }, + { + "epoch": 1.9216999780364594, + "grad_norm": 2.2122068405151367, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7094441652297974, + "num_tokens": 443234395.0, + "step": 17499 + }, + { + "epoch": 1.9218097957390732, + "grad_norm": 2.256352424621582, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.6973801851272583, + "num_tokens": 443261577.0, + "step": 17500 + }, + { + "epoch": 1.9219196134416867, + "grad_norm": 2.294013500213623, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.701947808265686, + "num_tokens": 443286875.0, + "step": 17501 + }, + { + "epoch": 1.9220294311443005, + "grad_norm": 2.6196279525756836, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7122796773910522, + "num_tokens": 443306746.0, + "step": 17502 + }, + { + "epoch": 1.9221392488469142, + "grad_norm": 2.4515490531921387, + "learning_rate": 1e-06, + "loss": 0.8502, + "mean_token_accuracy": 0.73589026927948, + "num_tokens": 
443329399.0, + "step": 17503 + }, + { + "epoch": 1.9222490665495278, + "grad_norm": 2.2137720584869385, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.7089152336120605, + "num_tokens": 443356684.0, + "step": 17504 + }, + { + "epoch": 1.9223588842521413, + "grad_norm": 2.3616271018981934, + "learning_rate": 1e-06, + "loss": 0.8495, + "mean_token_accuracy": 0.7340092062950134, + "num_tokens": 443379208.0, + "step": 17505 + }, + { + "epoch": 1.922468701954755, + "grad_norm": 2.0853371620178223, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.6902580261230469, + "num_tokens": 443409724.0, + "step": 17506 + }, + { + "epoch": 1.9225785196573688, + "grad_norm": 2.4068186283111572, + "learning_rate": 1e-06, + "loss": 0.86, + "mean_token_accuracy": 0.726914644241333, + "num_tokens": 443430971.0, + "step": 17507 + }, + { + "epoch": 1.9226883373599826, + "grad_norm": 2.5829145908355713, + "learning_rate": 1e-06, + "loss": 0.7785, + "mean_token_accuracy": 0.7582106590270996, + "num_tokens": 443449572.0, + "step": 17508 + }, + { + "epoch": 1.922798155062596, + "grad_norm": 2.6260786056518555, + "learning_rate": 1e-06, + "loss": 0.8487, + "mean_token_accuracy": 0.725254476070404, + "num_tokens": 443469735.0, + "step": 17509 + }, + { + "epoch": 1.9229079727652096, + "grad_norm": 2.165984630584717, + "learning_rate": 1e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7218550443649292, + "num_tokens": 443497182.0, + "step": 17510 + }, + { + "epoch": 1.9230177904678234, + "grad_norm": 2.5791101455688477, + "learning_rate": 1e-06, + "loss": 0.84, + "mean_token_accuracy": 0.7435416579246521, + "num_tokens": 443518359.0, + "step": 17511 + }, + { + "epoch": 1.9231276081704372, + "grad_norm": 1.9544075727462769, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7041122317314148, + "num_tokens": 443551852.0, + "step": 17512 + }, + { + "epoch": 1.9232374258730507, + "grad_norm": 2.0784881114959717, + "learning_rate": 
1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7126296162605286, + "num_tokens": 443581085.0, + "step": 17513 + }, + { + "epoch": 1.9233472435756644, + "grad_norm": 2.386355400085449, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7141560316085815, + "num_tokens": 443604439.0, + "step": 17514 + }, + { + "epoch": 1.923457061278278, + "grad_norm": 2.2767391204833984, + "learning_rate": 1e-06, + "loss": 1.0219, + "mean_token_accuracy": 0.6889902353286743, + "num_tokens": 443631348.0, + "step": 17515 + }, + { + "epoch": 1.9235668789808917, + "grad_norm": 2.312560796737671, + "learning_rate": 1e-06, + "loss": 0.8159, + "mean_token_accuracy": 0.7371715903282166, + "num_tokens": 443654886.0, + "step": 17516 + }, + { + "epoch": 1.9236766966835055, + "grad_norm": 2.0308759212493896, + "learning_rate": 1e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7291712164878845, + "num_tokens": 443684999.0, + "step": 17517 + }, + { + "epoch": 1.923786514386119, + "grad_norm": 2.7146384716033936, + "learning_rate": 1e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.7266864776611328, + "num_tokens": 443703339.0, + "step": 17518 + }, + { + "epoch": 1.9238963320887326, + "grad_norm": 2.212250232696533, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7081449031829834, + "num_tokens": 443728971.0, + "step": 17519 + }, + { + "epoch": 1.9240061497913463, + "grad_norm": 2.55220627784729, + "learning_rate": 1e-06, + "loss": 0.8361, + "mean_token_accuracy": 0.7312620878219604, + "num_tokens": 443749514.0, + "step": 17520 + }, + { + "epoch": 1.92411596749396, + "grad_norm": 2.204800605773926, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7096785306930542, + "num_tokens": 443775948.0, + "step": 17521 + }, + { + "epoch": 1.9242257851965738, + "grad_norm": 2.223926067352295, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7042784690856934, + "num_tokens": 443803852.0, + "step": 17522 + }, + { + 
"epoch": 1.9243356028991874, + "grad_norm": 2.305690288543701, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7209234237670898, + "num_tokens": 443828323.0, + "step": 17523 + }, + { + "epoch": 1.924445420601801, + "grad_norm": 2.257218837738037, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7085371017456055, + "num_tokens": 443854071.0, + "step": 17524 + }, + { + "epoch": 1.9245552383044147, + "grad_norm": 2.2665727138519287, + "learning_rate": 1e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7257179617881775, + "num_tokens": 443879344.0, + "step": 17525 + }, + { + "epoch": 1.9246650560070284, + "grad_norm": 2.3391036987304688, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7347692251205444, + "num_tokens": 443902835.0, + "step": 17526 + }, + { + "epoch": 1.924774873709642, + "grad_norm": 2.6417477130889893, + "learning_rate": 1e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7186610698699951, + "num_tokens": 443922464.0, + "step": 17527 + }, + { + "epoch": 1.9248846914122557, + "grad_norm": 2.407435894012451, + "learning_rate": 1e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7254321575164795, + "num_tokens": 443946127.0, + "step": 17528 + }, + { + "epoch": 1.9249945091148692, + "grad_norm": 2.263976573944092, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7244177460670471, + "num_tokens": 443972277.0, + "step": 17529 + }, + { + "epoch": 1.925104326817483, + "grad_norm": 2.627251625061035, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.713707685470581, + "num_tokens": 443992521.0, + "step": 17530 + }, + { + "epoch": 1.9252141445200968, + "grad_norm": 2.3425657749176025, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7205877304077148, + "num_tokens": 444017007.0, + "step": 17531 + }, + { + "epoch": 1.9253239622227103, + "grad_norm": 2.3078792095184326, + "learning_rate": 1e-06, + "loss": 0.8176, + 
"mean_token_accuracy": 0.752314567565918, + "num_tokens": 444042917.0, + "step": 17532 + }, + { + "epoch": 1.9254337799253238, + "grad_norm": 1.8959747552871704, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7056005597114563, + "num_tokens": 444074863.0, + "step": 17533 + }, + { + "epoch": 1.9255435976279376, + "grad_norm": 2.4216597080230713, + "learning_rate": 1e-06, + "loss": 0.8062, + "mean_token_accuracy": 0.7374849915504456, + "num_tokens": 444096019.0, + "step": 17534 + }, + { + "epoch": 1.9256534153305513, + "grad_norm": 2.310926914215088, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7078245282173157, + "num_tokens": 444122115.0, + "step": 17535 + }, + { + "epoch": 1.925763233033165, + "grad_norm": 2.182459592819214, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7064874768257141, + "num_tokens": 444148848.0, + "step": 17536 + }, + { + "epoch": 1.9258730507357786, + "grad_norm": 2.171271324157715, + "learning_rate": 1e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.722486138343811, + "num_tokens": 444176659.0, + "step": 17537 + }, + { + "epoch": 1.9259828684383922, + "grad_norm": 2.0928244590759277, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7011507749557495, + "num_tokens": 444206416.0, + "step": 17538 + }, + { + "epoch": 1.926092686141006, + "grad_norm": 2.155024766921997, + "learning_rate": 1e-06, + "loss": 0.8276, + "mean_token_accuracy": 0.7351988554000854, + "num_tokens": 444232546.0, + "step": 17539 + }, + { + "epoch": 1.9262025038436197, + "grad_norm": 2.2342607975006104, + "learning_rate": 1e-06, + "loss": 0.8628, + "mean_token_accuracy": 0.7288177013397217, + "num_tokens": 444259041.0, + "step": 17540 + }, + { + "epoch": 1.9263123215462332, + "grad_norm": 2.301259994506836, + "learning_rate": 1e-06, + "loss": 0.863, + "mean_token_accuracy": 0.7257480025291443, + "num_tokens": 444282100.0, + "step": 17541 + }, + { + "epoch": 1.9264221392488468, 
+ "grad_norm": 2.3614871501922607, + "learning_rate": 1e-06, + "loss": 0.8165, + "mean_token_accuracy": 0.7399308085441589, + "num_tokens": 444307601.0, + "step": 17542 + }, + { + "epoch": 1.9265319569514605, + "grad_norm": 2.1873159408569336, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7001839876174927, + "num_tokens": 444338179.0, + "step": 17543 + }, + { + "epoch": 1.9266417746540743, + "grad_norm": 2.4651360511779785, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7136474251747131, + "num_tokens": 444360228.0, + "step": 17544 + }, + { + "epoch": 1.926751592356688, + "grad_norm": 2.4498379230499268, + "learning_rate": 1e-06, + "loss": 0.823, + "mean_token_accuracy": 0.7354873418807983, + "num_tokens": 444381632.0, + "step": 17545 + }, + { + "epoch": 1.9268614100593016, + "grad_norm": 2.0361876487731934, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7133486270904541, + "num_tokens": 444411499.0, + "step": 17546 + }, + { + "epoch": 1.926971227761915, + "grad_norm": 2.4198567867279053, + "learning_rate": 1e-06, + "loss": 0.8573, + "mean_token_accuracy": 0.7380320429801941, + "num_tokens": 444434220.0, + "step": 17547 + }, + { + "epoch": 1.9270810454645289, + "grad_norm": 2.0612597465515137, + "learning_rate": 1e-06, + "loss": 1.0136, + "mean_token_accuracy": 0.6918861865997314, + "num_tokens": 444464850.0, + "step": 17548 + }, + { + "epoch": 1.9271908631671426, + "grad_norm": 2.2686054706573486, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7016297578811646, + "num_tokens": 444491296.0, + "step": 17549 + }, + { + "epoch": 1.9273006808697564, + "grad_norm": 2.2752201557159424, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.718234121799469, + "num_tokens": 444516786.0, + "step": 17550 + }, + { + "epoch": 1.92741049857237, + "grad_norm": 2.010624885559082, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7233314514160156, + 
"num_tokens": 444548115.0, + "step": 17551 + }, + { + "epoch": 1.9275203162749834, + "grad_norm": 2.2119569778442383, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.6959629058837891, + "num_tokens": 444575297.0, + "step": 17552 + }, + { + "epoch": 1.9276301339775972, + "grad_norm": 2.1814041137695312, + "learning_rate": 1e-06, + "loss": 0.8642, + "mean_token_accuracy": 0.7253468632698059, + "num_tokens": 444602392.0, + "step": 17553 + }, + { + "epoch": 1.927739951680211, + "grad_norm": 2.3695309162139893, + "learning_rate": 1e-06, + "loss": 0.847, + "mean_token_accuracy": 0.728872537612915, + "num_tokens": 444624058.0, + "step": 17554 + }, + { + "epoch": 1.9278497693828245, + "grad_norm": 2.1420583724975586, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7059450149536133, + "num_tokens": 444650658.0, + "step": 17555 + }, + { + "epoch": 1.927959587085438, + "grad_norm": 2.374516487121582, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7185202240943909, + "num_tokens": 444675174.0, + "step": 17556 + }, + { + "epoch": 1.9280694047880518, + "grad_norm": 2.1973793506622314, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7051549553871155, + "num_tokens": 444701297.0, + "step": 17557 + }, + { + "epoch": 1.9281792224906655, + "grad_norm": 2.243964910507202, + "learning_rate": 1e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.7298710346221924, + "num_tokens": 444727529.0, + "step": 17558 + }, + { + "epoch": 1.9282890401932793, + "grad_norm": 2.1833157539367676, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7100352644920349, + "num_tokens": 444754488.0, + "step": 17559 + }, + { + "epoch": 1.9283988578958928, + "grad_norm": 2.5751874446868896, + "learning_rate": 1e-06, + "loss": 0.8444, + "mean_token_accuracy": 0.7368850708007812, + "num_tokens": 444773926.0, + "step": 17560 + }, + { + "epoch": 1.9285086755985064, + "grad_norm": 2.065586566925049, + 
"learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.6975307464599609, + "num_tokens": 444805934.0, + "step": 17561 + }, + { + "epoch": 1.9286184933011201, + "grad_norm": 2.024038076400757, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.6895709037780762, + "num_tokens": 444835718.0, + "step": 17562 + }, + { + "epoch": 1.9287283110037339, + "grad_norm": 2.264787197113037, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7100881338119507, + "num_tokens": 444863120.0, + "step": 17563 + }, + { + "epoch": 1.9288381287063474, + "grad_norm": 2.4821865558624268, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7222979068756104, + "num_tokens": 444887191.0, + "step": 17564 + }, + { + "epoch": 1.9289479464089612, + "grad_norm": 2.0993709564208984, + "learning_rate": 1e-06, + "loss": 0.8353, + "mean_token_accuracy": 0.7330778241157532, + "num_tokens": 444914492.0, + "step": 17565 + }, + { + "epoch": 1.9290577641115747, + "grad_norm": 2.2332375049591064, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7161476612091064, + "num_tokens": 444937839.0, + "step": 17566 + }, + { + "epoch": 1.9291675818141885, + "grad_norm": 2.239680051803589, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7049079537391663, + "num_tokens": 444965506.0, + "step": 17567 + }, + { + "epoch": 1.9292773995168022, + "grad_norm": 2.4019086360931396, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7162078619003296, + "num_tokens": 444986697.0, + "step": 17568 + }, + { + "epoch": 1.9293872172194158, + "grad_norm": 2.380159854888916, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7183812856674194, + "num_tokens": 445008252.0, + "step": 17569 + }, + { + "epoch": 1.9294970349220293, + "grad_norm": 2.345254898071289, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7055244445800781, + "num_tokens": 445033505.0, + "step": 
17570 + }, + { + "epoch": 1.929606852624643, + "grad_norm": 2.362999200820923, + "learning_rate": 1e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7231531739234924, + "num_tokens": 445057867.0, + "step": 17571 + }, + { + "epoch": 1.9297166703272568, + "grad_norm": 2.316352128982544, + "learning_rate": 1e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.700564444065094, + "num_tokens": 445083757.0, + "step": 17572 + }, + { + "epoch": 1.9298264880298706, + "grad_norm": 2.3950371742248535, + "learning_rate": 1e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7304383516311646, + "num_tokens": 445106054.0, + "step": 17573 + }, + { + "epoch": 1.929936305732484, + "grad_norm": 2.625500440597534, + "learning_rate": 1e-06, + "loss": 0.8157, + "mean_token_accuracy": 0.74249666929245, + "num_tokens": 445125958.0, + "step": 17574 + }, + { + "epoch": 1.9300461234350976, + "grad_norm": 2.4625258445739746, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7106291055679321, + "num_tokens": 445148128.0, + "step": 17575 + }, + { + "epoch": 1.9301559411377114, + "grad_norm": 2.452049732208252, + "learning_rate": 1e-06, + "loss": 0.8501, + "mean_token_accuracy": 0.7292983531951904, + "num_tokens": 445169019.0, + "step": 17576 + }, + { + "epoch": 1.9302657588403251, + "grad_norm": 2.5955569744110107, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7240645289421082, + "num_tokens": 445189903.0, + "step": 17577 + }, + { + "epoch": 1.9303755765429387, + "grad_norm": 2.2416303157806396, + "learning_rate": 1e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7334407567977905, + "num_tokens": 445215958.0, + "step": 17578 + }, + { + "epoch": 1.9304853942455524, + "grad_norm": 2.140397787094116, + "learning_rate": 1e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.7197636365890503, + "num_tokens": 445241750.0, + "step": 17579 + }, + { + "epoch": 1.930595211948166, + "grad_norm": 2.5613136291503906, + "learning_rate": 1e-06, + "loss": 0.904, + 
"mean_token_accuracy": 0.721473217010498, + "num_tokens": 445262449.0, + "step": 17580 + }, + { + "epoch": 1.9307050296507797, + "grad_norm": 2.1485531330108643, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7062170505523682, + "num_tokens": 445290759.0, + "step": 17581 + }, + { + "epoch": 1.9308148473533935, + "grad_norm": 2.3286986351013184, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7279032468795776, + "num_tokens": 445314927.0, + "step": 17582 + }, + { + "epoch": 1.930924665056007, + "grad_norm": 2.3448984622955322, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7111608386039734, + "num_tokens": 445339162.0, + "step": 17583 + }, + { + "epoch": 1.9310344827586206, + "grad_norm": 2.5679006576538086, + "learning_rate": 1e-06, + "loss": 0.8602, + "mean_token_accuracy": 0.7306488752365112, + "num_tokens": 445360110.0, + "step": 17584 + }, + { + "epoch": 1.9311443004612343, + "grad_norm": 2.380965232849121, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.6894238591194153, + "num_tokens": 445384676.0, + "step": 17585 + }, + { + "epoch": 1.931254118163848, + "grad_norm": 2.1643354892730713, + "learning_rate": 1e-06, + "loss": 0.8259, + "mean_token_accuracy": 0.7405415177345276, + "num_tokens": 445410783.0, + "step": 17586 + }, + { + "epoch": 1.9313639358664618, + "grad_norm": 2.1853227615356445, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7246360182762146, + "num_tokens": 445435256.0, + "step": 17587 + }, + { + "epoch": 1.9314737535690754, + "grad_norm": 2.2017340660095215, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7168677449226379, + "num_tokens": 445462772.0, + "step": 17588 + }, + { + "epoch": 1.931583571271689, + "grad_norm": 2.5834596157073975, + "learning_rate": 1e-06, + "loss": 0.8709, + "mean_token_accuracy": 0.7328131794929504, + "num_tokens": 445483357.0, + "step": 17589 + }, + { + "epoch": 
1.9316933889743026, + "grad_norm": 2.30979323387146, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7250977754592896, + "num_tokens": 445508180.0, + "step": 17590 + }, + { + "epoch": 1.9318032066769164, + "grad_norm": 2.271660804748535, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7148956656455994, + "num_tokens": 445534040.0, + "step": 17591 + }, + { + "epoch": 1.93191302437953, + "grad_norm": 2.0512373447418213, + "learning_rate": 1e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.7259635329246521, + "num_tokens": 445564734.0, + "step": 17592 + }, + { + "epoch": 1.9320228420821435, + "grad_norm": 2.1425764560699463, + "learning_rate": 1e-06, + "loss": 0.876, + "mean_token_accuracy": 0.7234295606613159, + "num_tokens": 445591067.0, + "step": 17593 + }, + { + "epoch": 1.9321326597847572, + "grad_norm": 2.5954949855804443, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7239137291908264, + "num_tokens": 445611201.0, + "step": 17594 + }, + { + "epoch": 1.932242477487371, + "grad_norm": 2.143825054168701, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7171589136123657, + "num_tokens": 445636586.0, + "step": 17595 + }, + { + "epoch": 1.9323522951899847, + "grad_norm": 1.9941799640655518, + "learning_rate": 1e-06, + "loss": 0.7993, + "mean_token_accuracy": 0.7447752356529236, + "num_tokens": 445665977.0, + "step": 17596 + }, + { + "epoch": 1.9324621128925983, + "grad_norm": 2.2295567989349365, + "learning_rate": 1e-06, + "loss": 0.865, + "mean_token_accuracy": 0.7355176210403442, + "num_tokens": 445692835.0, + "step": 17597 + }, + { + "epoch": 1.9325719305952118, + "grad_norm": 2.338937282562256, + "learning_rate": 1e-06, + "loss": 0.7831, + "mean_token_accuracy": 0.751695454120636, + "num_tokens": 445715801.0, + "step": 17598 + }, + { + "epoch": 1.9326817482978256, + "grad_norm": 1.9709045886993408, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 
0.7147406339645386, + "num_tokens": 445748442.0, + "step": 17599 + }, + { + "epoch": 1.9327915660004393, + "grad_norm": 2.0269672870635986, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7181859016418457, + "num_tokens": 445777354.0, + "step": 17600 + }, + { + "epoch": 1.932901383703053, + "grad_norm": 2.4627246856689453, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7362248301506042, + "num_tokens": 445799011.0, + "step": 17601 + }, + { + "epoch": 1.9330112014056666, + "grad_norm": 2.264784336090088, + "learning_rate": 1e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7278637886047363, + "num_tokens": 445824433.0, + "step": 17602 + }, + { + "epoch": 1.9331210191082802, + "grad_norm": 2.1420185565948486, + "learning_rate": 1e-06, + "loss": 0.8389, + "mean_token_accuracy": 0.7343497276306152, + "num_tokens": 445852910.0, + "step": 17603 + }, + { + "epoch": 1.933230836810894, + "grad_norm": 2.4309279918670654, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7123499512672424, + "num_tokens": 445876944.0, + "step": 17604 + }, + { + "epoch": 1.9333406545135077, + "grad_norm": 1.9749293327331543, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7061086297035217, + "num_tokens": 445909722.0, + "step": 17605 + }, + { + "epoch": 1.9334504722161212, + "grad_norm": 2.349452495574951, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7046825885772705, + "num_tokens": 445933920.0, + "step": 17606 + }, + { + "epoch": 1.9335602899187347, + "grad_norm": 2.239180326461792, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7006877660751343, + "num_tokens": 445962166.0, + "step": 17607 + }, + { + "epoch": 1.9336701076213485, + "grad_norm": 2.591029405593872, + "learning_rate": 1e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.7332240343093872, + "num_tokens": 445983150.0, + "step": 17608 + }, + { + "epoch": 1.9337799253239623, + "grad_norm": 
2.207015037536621, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.715370774269104, + "num_tokens": 446010373.0, + "step": 17609 + }, + { + "epoch": 1.933889743026576, + "grad_norm": 2.1283726692199707, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7098852396011353, + "num_tokens": 446040276.0, + "step": 17610 + }, + { + "epoch": 1.9339995607291895, + "grad_norm": 2.2688019275665283, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7103557586669922, + "num_tokens": 446067408.0, + "step": 17611 + }, + { + "epoch": 1.934109378431803, + "grad_norm": 2.37570858001709, + "learning_rate": 1e-06, + "loss": 0.8671, + "mean_token_accuracy": 0.7278691530227661, + "num_tokens": 446089260.0, + "step": 17612 + }, + { + "epoch": 1.9342191961344168, + "grad_norm": 2.3737587928771973, + "learning_rate": 1e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7290313243865967, + "num_tokens": 446110801.0, + "step": 17613 + }, + { + "epoch": 1.9343290138370306, + "grad_norm": 2.474306344985962, + "learning_rate": 1e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.7372473478317261, + "num_tokens": 446133957.0, + "step": 17614 + }, + { + "epoch": 1.9344388315396444, + "grad_norm": 2.249110698699951, + "learning_rate": 1e-06, + "loss": 0.7969, + "mean_token_accuracy": 0.7480496764183044, + "num_tokens": 446159023.0, + "step": 17615 + }, + { + "epoch": 1.934548649242258, + "grad_norm": 2.4092230796813965, + "learning_rate": 1e-06, + "loss": 0.829, + "mean_token_accuracy": 0.7408075332641602, + "num_tokens": 446181031.0, + "step": 17616 + }, + { + "epoch": 1.9346584669448714, + "grad_norm": 2.083606004714966, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7102646827697754, + "num_tokens": 446209084.0, + "step": 17617 + }, + { + "epoch": 1.9347682846474852, + "grad_norm": 2.2381160259246826, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7125940322875977, + "num_tokens": 
446236002.0, + "step": 17618 + }, + { + "epoch": 1.934878102350099, + "grad_norm": 2.2360613346099854, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.718481183052063, + "num_tokens": 446262753.0, + "step": 17619 + }, + { + "epoch": 1.9349879200527125, + "grad_norm": 2.4482390880584717, + "learning_rate": 1e-06, + "loss": 0.8306, + "mean_token_accuracy": 0.7318666577339172, + "num_tokens": 446283746.0, + "step": 17620 + }, + { + "epoch": 1.935097737755326, + "grad_norm": 2.304790735244751, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7105998396873474, + "num_tokens": 446310319.0, + "step": 17621 + }, + { + "epoch": 1.9352075554579398, + "grad_norm": 2.142228603363037, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.701606273651123, + "num_tokens": 446338730.0, + "step": 17622 + }, + { + "epoch": 1.9353173731605535, + "grad_norm": 2.4481260776519775, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7224974036216736, + "num_tokens": 446359387.0, + "step": 17623 + }, + { + "epoch": 1.9354271908631673, + "grad_norm": 2.597841262817383, + "learning_rate": 1e-06, + "loss": 0.8396, + "mean_token_accuracy": 0.7324926257133484, + "num_tokens": 446379170.0, + "step": 17624 + }, + { + "epoch": 1.9355370085657808, + "grad_norm": 2.1296255588531494, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.71697998046875, + "num_tokens": 446406652.0, + "step": 17625 + }, + { + "epoch": 1.9356468262683943, + "grad_norm": 2.253478527069092, + "learning_rate": 1e-06, + "loss": 0.8511, + "mean_token_accuracy": 0.7350560426712036, + "num_tokens": 446430827.0, + "step": 17626 + }, + { + "epoch": 1.935756643971008, + "grad_norm": 2.221392869949341, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7015926837921143, + "num_tokens": 446457376.0, + "step": 17627 + }, + { + "epoch": 1.9358664616736219, + "grad_norm": 2.1666314601898193, + "learning_rate": 1e-06, 
+ "loss": 0.9144, + "mean_token_accuracy": 0.7143158912658691, + "num_tokens": 446484136.0, + "step": 17628 + }, + { + "epoch": 1.9359762793762354, + "grad_norm": 2.157050371170044, + "learning_rate": 1e-06, + "loss": 0.8704, + "mean_token_accuracy": 0.7292936444282532, + "num_tokens": 446509784.0, + "step": 17629 + }, + { + "epoch": 1.9360860970788492, + "grad_norm": 2.3672125339508057, + "learning_rate": 1e-06, + "loss": 0.8626, + "mean_token_accuracy": 0.7273361682891846, + "num_tokens": 446534042.0, + "step": 17630 + }, + { + "epoch": 1.9361959147814627, + "grad_norm": 2.014435052871704, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.699640154838562, + "num_tokens": 446565942.0, + "step": 17631 + }, + { + "epoch": 1.9363057324840764, + "grad_norm": 2.524606466293335, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7126643061637878, + "num_tokens": 446587957.0, + "step": 17632 + }, + { + "epoch": 1.9364155501866902, + "grad_norm": 2.2381296157836914, + "learning_rate": 1e-06, + "loss": 0.7533, + "mean_token_accuracy": 0.7622406482696533, + "num_tokens": 446610766.0, + "step": 17633 + }, + { + "epoch": 1.9365253678893037, + "grad_norm": 2.0046114921569824, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7239918112754822, + "num_tokens": 446643567.0, + "step": 17634 + }, + { + "epoch": 1.9366351855919173, + "grad_norm": 1.8595131635665894, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7137974500656128, + "num_tokens": 446675308.0, + "step": 17635 + }, + { + "epoch": 1.936745003294531, + "grad_norm": 2.099811553955078, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7171390056610107, + "num_tokens": 446701707.0, + "step": 17636 + }, + { + "epoch": 1.9368548209971448, + "grad_norm": 2.339853286743164, + "learning_rate": 1e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7176236510276794, + "num_tokens": 446724021.0, + "step": 17637 + }, + { + 
"epoch": 1.9369646386997585, + "grad_norm": 2.5647284984588623, + "learning_rate": 1e-06, + "loss": 0.7969, + "mean_token_accuracy": 0.7472019195556641, + "num_tokens": 446743074.0, + "step": 17638 + }, + { + "epoch": 1.937074456402372, + "grad_norm": 2.052696466445923, + "learning_rate": 1e-06, + "loss": 0.8361, + "mean_token_accuracy": 0.7387420535087585, + "num_tokens": 446769065.0, + "step": 17639 + }, + { + "epoch": 1.9371842741049856, + "grad_norm": 2.3011558055877686, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7062387466430664, + "num_tokens": 446794932.0, + "step": 17640 + }, + { + "epoch": 1.9372940918075994, + "grad_norm": 2.251110553741455, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.6975911855697632, + "num_tokens": 446822377.0, + "step": 17641 + }, + { + "epoch": 1.9374039095102131, + "grad_norm": 2.1136207580566406, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7109615802764893, + "num_tokens": 446848530.0, + "step": 17642 + }, + { + "epoch": 1.9375137272128267, + "grad_norm": 2.4063057899475098, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7020166516304016, + "num_tokens": 446872127.0, + "step": 17643 + }, + { + "epoch": 1.9376235449154404, + "grad_norm": 2.1567583084106445, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7154147624969482, + "num_tokens": 446899534.0, + "step": 17644 + }, + { + "epoch": 1.937733362618054, + "grad_norm": 2.2330610752105713, + "learning_rate": 1e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.7294826507568359, + "num_tokens": 446924140.0, + "step": 17645 + }, + { + "epoch": 1.9378431803206677, + "grad_norm": 1.8946342468261719, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7128616571426392, + "num_tokens": 446957246.0, + "step": 17646 + }, + { + "epoch": 1.9379529980232815, + "grad_norm": 2.1716115474700928, + "learning_rate": 1e-06, + "loss": 0.8738, + 
"mean_token_accuracy": 0.7317245006561279, + "num_tokens": 446984096.0, + "step": 17647 + }, + { + "epoch": 1.938062815725895, + "grad_norm": 2.2509713172912598, + "learning_rate": 1e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7208676338195801, + "num_tokens": 447009993.0, + "step": 17648 + }, + { + "epoch": 1.9381726334285085, + "grad_norm": 2.305814027786255, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7210624814033508, + "num_tokens": 447033299.0, + "step": 17649 + }, + { + "epoch": 1.9382824511311223, + "grad_norm": 2.2738475799560547, + "learning_rate": 1e-06, + "loss": 0.8465, + "mean_token_accuracy": 0.7358001470565796, + "num_tokens": 447057346.0, + "step": 17650 + }, + { + "epoch": 1.938392268833736, + "grad_norm": 2.31913161277771, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7274300456047058, + "num_tokens": 447079113.0, + "step": 17651 + }, + { + "epoch": 1.9385020865363498, + "grad_norm": 2.416139602661133, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7273926734924316, + "num_tokens": 447102758.0, + "step": 17652 + }, + { + "epoch": 1.9386119042389633, + "grad_norm": 2.384521484375, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7159996032714844, + "num_tokens": 447125903.0, + "step": 17653 + }, + { + "epoch": 1.9387217219415769, + "grad_norm": 2.2291760444641113, + "learning_rate": 1e-06, + "loss": 0.8434, + "mean_token_accuracy": 0.7359926700592041, + "num_tokens": 447150474.0, + "step": 17654 + }, + { + "epoch": 1.9388315396441906, + "grad_norm": 2.1718363761901855, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.702967643737793, + "num_tokens": 447178771.0, + "step": 17655 + }, + { + "epoch": 1.9389413573468044, + "grad_norm": 2.557589292526245, + "learning_rate": 1e-06, + "loss": 0.8136, + "mean_token_accuracy": 0.741863489151001, + "num_tokens": 447199250.0, + "step": 17656 + }, + { + "epoch": 1.939051175049418, + 
"grad_norm": 2.2909815311431885, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.701759934425354, + "num_tokens": 447223744.0, + "step": 17657 + }, + { + "epoch": 1.9391609927520315, + "grad_norm": 2.513612747192383, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.710117518901825, + "num_tokens": 447245088.0, + "step": 17658 + }, + { + "epoch": 1.9392708104546452, + "grad_norm": 1.832162618637085, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7231271266937256, + "num_tokens": 447277051.0, + "step": 17659 + }, + { + "epoch": 1.939380628157259, + "grad_norm": 2.1058170795440674, + "learning_rate": 1e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.6998875141143799, + "num_tokens": 447306204.0, + "step": 17660 + }, + { + "epoch": 1.9394904458598727, + "grad_norm": 2.297208070755005, + "learning_rate": 1e-06, + "loss": 0.8364, + "mean_token_accuracy": 0.7422586679458618, + "num_tokens": 447328939.0, + "step": 17661 + }, + { + "epoch": 1.9396002635624863, + "grad_norm": 2.4641635417938232, + "learning_rate": 1e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.7293897271156311, + "num_tokens": 447350324.0, + "step": 17662 + }, + { + "epoch": 1.9397100812650998, + "grad_norm": 2.5002388954162598, + "learning_rate": 1e-06, + "loss": 0.7102, + "mean_token_accuracy": 0.7689187526702881, + "num_tokens": 447369183.0, + "step": 17663 + }, + { + "epoch": 1.9398198989677136, + "grad_norm": 2.4294376373291016, + "learning_rate": 1e-06, + "loss": 0.6966, + "mean_token_accuracy": 0.7691208720207214, + "num_tokens": 447388508.0, + "step": 17664 + }, + { + "epoch": 1.9399297166703273, + "grad_norm": 2.2912962436676025, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.703961968421936, + "num_tokens": 447412518.0, + "step": 17665 + }, + { + "epoch": 1.940039534372941, + "grad_norm": 2.057156562805176, + "learning_rate": 1e-06, + "loss": 0.8499, + "mean_token_accuracy": 0.7411314249038696, + 
"num_tokens": 447441642.0, + "step": 17666 + }, + { + "epoch": 1.9401493520755546, + "grad_norm": 2.2832722663879395, + "learning_rate": 1e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.7159552574157715, + "num_tokens": 447467431.0, + "step": 17667 + }, + { + "epoch": 1.9402591697781681, + "grad_norm": 2.4436469078063965, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7229827642440796, + "num_tokens": 447489875.0, + "step": 17668 + }, + { + "epoch": 1.940368987480782, + "grad_norm": 2.4796230792999268, + "learning_rate": 1e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7437302470207214, + "num_tokens": 447512054.0, + "step": 17669 + }, + { + "epoch": 1.9404788051833957, + "grad_norm": 2.6437649726867676, + "learning_rate": 1e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7299656867980957, + "num_tokens": 447532014.0, + "step": 17670 + }, + { + "epoch": 1.9405886228860092, + "grad_norm": 2.1683287620544434, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7065116167068481, + "num_tokens": 447558839.0, + "step": 17671 + }, + { + "epoch": 1.9406984405886227, + "grad_norm": 2.346846103668213, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7196257710456848, + "num_tokens": 447581207.0, + "step": 17672 + }, + { + "epoch": 1.9408082582912365, + "grad_norm": 2.189603090286255, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7039741277694702, + "num_tokens": 447607352.0, + "step": 17673 + }, + { + "epoch": 1.9409180759938502, + "grad_norm": 2.14770245552063, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.71214359998703, + "num_tokens": 447634424.0, + "step": 17674 + }, + { + "epoch": 1.941027893696464, + "grad_norm": 2.125401496887207, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7045373916625977, + "num_tokens": 447662615.0, + "step": 17675 + }, + { + "epoch": 1.9411377113990775, + "grad_norm": 2.3708646297454834, + 
"learning_rate": 1e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.7273569703102112, + "num_tokens": 447684251.0, + "step": 17676 + }, + { + "epoch": 1.941247529101691, + "grad_norm": 2.190213918685913, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7217812538146973, + "num_tokens": 447710388.0, + "step": 17677 + }, + { + "epoch": 1.9413573468043048, + "grad_norm": 2.1466245651245117, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.6934986114501953, + "num_tokens": 447739869.0, + "step": 17678 + }, + { + "epoch": 1.9414671645069186, + "grad_norm": 2.4570744037628174, + "learning_rate": 1e-06, + "loss": 0.7625, + "mean_token_accuracy": 0.7555318474769592, + "num_tokens": 447758985.0, + "step": 17679 + }, + { + "epoch": 1.9415769822095323, + "grad_norm": 2.18876051902771, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7296514511108398, + "num_tokens": 447784370.0, + "step": 17680 + }, + { + "epoch": 1.9416867999121459, + "grad_norm": 2.4672133922576904, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7296263575553894, + "num_tokens": 447805291.0, + "step": 17681 + }, + { + "epoch": 1.9417966176147594, + "grad_norm": 2.408496856689453, + "learning_rate": 1e-06, + "loss": 0.8937, + "mean_token_accuracy": 0.7330924272537231, + "num_tokens": 447825886.0, + "step": 17682 + }, + { + "epoch": 1.9419064353173732, + "grad_norm": 2.1498095989227295, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7166595458984375, + "num_tokens": 447855640.0, + "step": 17683 + }, + { + "epoch": 1.942016253019987, + "grad_norm": 2.390620708465576, + "learning_rate": 1e-06, + "loss": 0.8677, + "mean_token_accuracy": 0.7316782474517822, + "num_tokens": 447878325.0, + "step": 17684 + }, + { + "epoch": 1.9421260707226005, + "grad_norm": 2.2179064750671387, + "learning_rate": 1e-06, + "loss": 0.807, + "mean_token_accuracy": 0.7507171630859375, + "num_tokens": 447903337.0, + "step": 
17685 + }, + { + "epoch": 1.942235888425214, + "grad_norm": 2.2087268829345703, + "learning_rate": 1e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.730469286441803, + "num_tokens": 447931596.0, + "step": 17686 + }, + { + "epoch": 1.9423457061278278, + "grad_norm": 2.4270434379577637, + "learning_rate": 1e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7259024381637573, + "num_tokens": 447955352.0, + "step": 17687 + }, + { + "epoch": 1.9424555238304415, + "grad_norm": 2.0468270778656006, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7096390724182129, + "num_tokens": 447984921.0, + "step": 17688 + }, + { + "epoch": 1.9425653415330553, + "grad_norm": 2.614208936691284, + "learning_rate": 1e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.7331593036651611, + "num_tokens": 448004720.0, + "step": 17689 + }, + { + "epoch": 1.9426751592356688, + "grad_norm": 2.0709481239318848, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7244577407836914, + "num_tokens": 448034719.0, + "step": 17690 + }, + { + "epoch": 1.9427849769382823, + "grad_norm": 2.395246744155884, + "learning_rate": 1e-06, + "loss": 0.8369, + "mean_token_accuracy": 0.7328635454177856, + "num_tokens": 448057303.0, + "step": 17691 + }, + { + "epoch": 1.942894794640896, + "grad_norm": 2.0165390968322754, + "learning_rate": 1e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7212589383125305, + "num_tokens": 448088679.0, + "step": 17692 + }, + { + "epoch": 1.9430046123435099, + "grad_norm": 2.0797526836395264, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7095344066619873, + "num_tokens": 448117963.0, + "step": 17693 + }, + { + "epoch": 1.9431144300461234, + "grad_norm": 1.947747826576233, + "learning_rate": 1e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.7277261018753052, + "num_tokens": 448149862.0, + "step": 17694 + }, + { + "epoch": 1.9432242477487371, + "grad_norm": 2.36911940574646, + "learning_rate": 1e-06, + "loss": 0.8456, + 
"mean_token_accuracy": 0.7317638397216797, + "num_tokens": 448170847.0, + "step": 17695 + }, + { + "epoch": 1.9433340654513507, + "grad_norm": 2.7114055156707764, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7227121591567993, + "num_tokens": 448189356.0, + "step": 17696 + }, + { + "epoch": 1.9434438831539644, + "grad_norm": 2.3295722007751465, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7233139276504517, + "num_tokens": 448213733.0, + "step": 17697 + }, + { + "epoch": 1.9435537008565782, + "grad_norm": 2.2944657802581787, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7129957675933838, + "num_tokens": 448239762.0, + "step": 17698 + }, + { + "epoch": 1.9436635185591917, + "grad_norm": 2.2018637657165527, + "learning_rate": 1e-06, + "loss": 0.8397, + "mean_token_accuracy": 0.7437655329704285, + "num_tokens": 448263564.0, + "step": 17699 + }, + { + "epoch": 1.9437733362618053, + "grad_norm": 2.0582051277160645, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7040297389030457, + "num_tokens": 448293793.0, + "step": 17700 + }, + { + "epoch": 1.943883153964419, + "grad_norm": 2.330033302307129, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7205418348312378, + "num_tokens": 448316470.0, + "step": 17701 + }, + { + "epoch": 1.9439929716670328, + "grad_norm": 2.0655596256256104, + "learning_rate": 1e-06, + "loss": 0.7744, + "mean_token_accuracy": 0.7502676248550415, + "num_tokens": 448342861.0, + "step": 17702 + }, + { + "epoch": 1.9441027893696465, + "grad_norm": 2.1009304523468018, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7094364166259766, + "num_tokens": 448371331.0, + "step": 17703 + }, + { + "epoch": 1.94421260707226, + "grad_norm": 2.2920053005218506, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7079837918281555, + "num_tokens": 448398017.0, + "step": 17704 + }, + { + "epoch": 
1.9443224247748736, + "grad_norm": 2.1585094928741455, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7242801189422607, + "num_tokens": 448423690.0, + "step": 17705 + }, + { + "epoch": 1.9444322424774874, + "grad_norm": 2.575716972351074, + "learning_rate": 1e-06, + "loss": 0.8361, + "mean_token_accuracy": 0.7350955009460449, + "num_tokens": 448443191.0, + "step": 17706 + }, + { + "epoch": 1.9445420601801011, + "grad_norm": 2.6407053470611572, + "learning_rate": 1e-06, + "loss": 0.8346, + "mean_token_accuracy": 0.7357007265090942, + "num_tokens": 448462882.0, + "step": 17707 + }, + { + "epoch": 1.9446518778827147, + "grad_norm": 2.2248451709747314, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7055583596229553, + "num_tokens": 448490852.0, + "step": 17708 + }, + { + "epoch": 1.9447616955853284, + "grad_norm": 2.421382427215576, + "learning_rate": 1e-06, + "loss": 0.8368, + "mean_token_accuracy": 0.7347730398178101, + "num_tokens": 448511593.0, + "step": 17709 + }, + { + "epoch": 1.944871513287942, + "grad_norm": 2.223968267440796, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.711687445640564, + "num_tokens": 448536957.0, + "step": 17710 + }, + { + "epoch": 1.9449813309905557, + "grad_norm": 2.26725172996521, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7201979756355286, + "num_tokens": 448561619.0, + "step": 17711 + }, + { + "epoch": 1.9450911486931695, + "grad_norm": 2.3013455867767334, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7189189195632935, + "num_tokens": 448586404.0, + "step": 17712 + }, + { + "epoch": 1.945200966395783, + "grad_norm": 2.26123309135437, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7121098637580872, + "num_tokens": 448612387.0, + "step": 17713 + }, + { + "epoch": 1.9453107840983965, + "grad_norm": 1.9245887994766235, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 
0.7156655788421631, + "num_tokens": 448643956.0, + "step": 17714 + }, + { + "epoch": 1.9454206018010103, + "grad_norm": 2.2722461223602295, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7140138745307922, + "num_tokens": 448667435.0, + "step": 17715 + }, + { + "epoch": 1.945530419503624, + "grad_norm": 2.4465553760528564, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7231917381286621, + "num_tokens": 448689983.0, + "step": 17716 + }, + { + "epoch": 1.9456402372062378, + "grad_norm": 2.0253074169158936, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.706937313079834, + "num_tokens": 448720914.0, + "step": 17717 + }, + { + "epoch": 1.9457500549088513, + "grad_norm": 2.7444050312042236, + "learning_rate": 1e-06, + "loss": 0.8355, + "mean_token_accuracy": 0.7329442501068115, + "num_tokens": 448739961.0, + "step": 17718 + }, + { + "epoch": 1.9458598726114649, + "grad_norm": 2.0909018516540527, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7103889584541321, + "num_tokens": 448769104.0, + "step": 17719 + }, + { + "epoch": 1.9459696903140786, + "grad_norm": 2.5734894275665283, + "learning_rate": 1e-06, + "loss": 0.8543, + "mean_token_accuracy": 0.7250814437866211, + "num_tokens": 448788746.0, + "step": 17720 + }, + { + "epoch": 1.9460795080166924, + "grad_norm": 2.3678290843963623, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7110418081283569, + "num_tokens": 448812613.0, + "step": 17721 + }, + { + "epoch": 1.946189325719306, + "grad_norm": 1.9576128721237183, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7201254367828369, + "num_tokens": 448843119.0, + "step": 17722 + }, + { + "epoch": 1.9462991434219195, + "grad_norm": 2.7122750282287598, + "learning_rate": 1e-06, + "loss": 0.832, + "mean_token_accuracy": 0.7334216833114624, + "num_tokens": 448861281.0, + "step": 17723 + }, + { + "epoch": 1.9464089611245332, + "grad_norm": 
2.0536246299743652, + "learning_rate": 1e-06, + "loss": 0.816, + "mean_token_accuracy": 0.7426248788833618, + "num_tokens": 448887801.0, + "step": 17724 + }, + { + "epoch": 1.946518778827147, + "grad_norm": 2.4734485149383545, + "learning_rate": 1e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7393124103546143, + "num_tokens": 448909214.0, + "step": 17725 + }, + { + "epoch": 1.9466285965297607, + "grad_norm": 2.414700746536255, + "learning_rate": 1e-06, + "loss": 0.8576, + "mean_token_accuracy": 0.7315117120742798, + "num_tokens": 448931395.0, + "step": 17726 + }, + { + "epoch": 1.9467384142323743, + "grad_norm": 2.1426377296447754, + "learning_rate": 1e-06, + "loss": 0.8019, + "mean_token_accuracy": 0.7427686452865601, + "num_tokens": 448956265.0, + "step": 17727 + }, + { + "epoch": 1.9468482319349878, + "grad_norm": 2.2469661235809326, + "learning_rate": 1e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.7240477204322815, + "num_tokens": 448981539.0, + "step": 17728 + }, + { + "epoch": 1.9469580496376016, + "grad_norm": 2.2008557319641113, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.738800585269928, + "num_tokens": 449006727.0, + "step": 17729 + }, + { + "epoch": 1.9470678673402153, + "grad_norm": 2.8490705490112305, + "learning_rate": 1e-06, + "loss": 0.7988, + "mean_token_accuracy": 0.7456365823745728, + "num_tokens": 449023938.0, + "step": 17730 + }, + { + "epoch": 1.947177685042829, + "grad_norm": 2.4031834602355957, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7053912878036499, + "num_tokens": 449048586.0, + "step": 17731 + }, + { + "epoch": 1.9472875027454426, + "grad_norm": 2.475559949874878, + "learning_rate": 1e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.7299434542655945, + "num_tokens": 449070253.0, + "step": 17732 + }, + { + "epoch": 1.9473973204480561, + "grad_norm": 2.2962398529052734, + "learning_rate": 1e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.7263214588165283, + "num_tokens": 
449095557.0, + "step": 17733 + }, + { + "epoch": 1.94750713815067, + "grad_norm": 2.411144733428955, + "learning_rate": 1e-06, + "loss": 0.7214, + "mean_token_accuracy": 0.7705528736114502, + "num_tokens": 449116553.0, + "step": 17734 + }, + { + "epoch": 1.9476169558532836, + "grad_norm": 2.580843925476074, + "learning_rate": 1e-06, + "loss": 0.7321, + "mean_token_accuracy": 0.7602839469909668, + "num_tokens": 449134518.0, + "step": 17735 + }, + { + "epoch": 1.9477267735558972, + "grad_norm": 2.2186684608459473, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7116851806640625, + "num_tokens": 449159322.0, + "step": 17736 + }, + { + "epoch": 1.9478365912585107, + "grad_norm": 2.5547094345092773, + "learning_rate": 1e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7219935059547424, + "num_tokens": 449180730.0, + "step": 17737 + }, + { + "epoch": 1.9479464089611245, + "grad_norm": 2.057152509689331, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7147217392921448, + "num_tokens": 449208765.0, + "step": 17738 + }, + { + "epoch": 1.9480562266637382, + "grad_norm": 2.2848801612854004, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7244125604629517, + "num_tokens": 449234167.0, + "step": 17739 + }, + { + "epoch": 1.948166044366352, + "grad_norm": 2.446103096008301, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7135723233222961, + "num_tokens": 449256750.0, + "step": 17740 + }, + { + "epoch": 1.9482758620689655, + "grad_norm": 2.167233943939209, + "learning_rate": 1e-06, + "loss": 0.8472, + "mean_token_accuracy": 0.7342625856399536, + "num_tokens": 449281539.0, + "step": 17741 + }, + { + "epoch": 1.948385679771579, + "grad_norm": 2.1450116634368896, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7171575427055359, + "num_tokens": 449307427.0, + "step": 17742 + }, + { + "epoch": 1.9484954974741928, + "grad_norm": 2.398212432861328, + "learning_rate": 1e-06, 
+ "loss": 0.8273, + "mean_token_accuracy": 0.7417914867401123, + "num_tokens": 449329513.0, + "step": 17743 + }, + { + "epoch": 1.9486053151768066, + "grad_norm": 2.217226505279541, + "learning_rate": 1e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7371697425842285, + "num_tokens": 449356892.0, + "step": 17744 + }, + { + "epoch": 1.94871513287942, + "grad_norm": 2.2373993396759033, + "learning_rate": 1e-06, + "loss": 0.8326, + "mean_token_accuracy": 0.736905038356781, + "num_tokens": 449380996.0, + "step": 17745 + }, + { + "epoch": 1.9488249505820339, + "grad_norm": 2.136193037033081, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7277978658676147, + "num_tokens": 449408285.0, + "step": 17746 + }, + { + "epoch": 1.9489347682846474, + "grad_norm": 2.106121063232422, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7288353443145752, + "num_tokens": 449436249.0, + "step": 17747 + }, + { + "epoch": 1.9490445859872612, + "grad_norm": 2.3224148750305176, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7231204509735107, + "num_tokens": 449460072.0, + "step": 17748 + }, + { + "epoch": 1.949154403689875, + "grad_norm": 2.4650158882141113, + "learning_rate": 1e-06, + "loss": 0.8472, + "mean_token_accuracy": 0.7316974997520447, + "num_tokens": 449481360.0, + "step": 17749 + }, + { + "epoch": 1.9492642213924884, + "grad_norm": 1.9029258489608765, + "learning_rate": 1e-06, + "loss": 0.8374, + "mean_token_accuracy": 0.7368797063827515, + "num_tokens": 449513898.0, + "step": 17750 + }, + { + "epoch": 1.949374039095102, + "grad_norm": 2.5842413902282715, + "learning_rate": 1e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7234029769897461, + "num_tokens": 449541297.0, + "step": 17751 + }, + { + "epoch": 1.9494838567977157, + "grad_norm": 2.632453680038452, + "learning_rate": 1e-06, + "loss": 0.7987, + "mean_token_accuracy": 0.741865873336792, + "num_tokens": 449560207.0, + "step": 17752 + }, + { + "epoch": 
1.9495936745003295, + "grad_norm": 2.278923273086548, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7039067149162292, + "num_tokens": 449586735.0, + "step": 17753 + }, + { + "epoch": 1.9497034922029433, + "grad_norm": 2.4611804485321045, + "learning_rate": 1e-06, + "loss": 0.8166, + "mean_token_accuracy": 0.7411539554595947, + "num_tokens": 449606853.0, + "step": 17754 + }, + { + "epoch": 1.9498133099055568, + "grad_norm": 2.1659274101257324, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7155836224555969, + "num_tokens": 449631890.0, + "step": 17755 + }, + { + "epoch": 1.9499231276081703, + "grad_norm": 1.8869431018829346, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7069950699806213, + "num_tokens": 449665859.0, + "step": 17756 + }, + { + "epoch": 1.950032945310784, + "grad_norm": 2.564488172531128, + "learning_rate": 1e-06, + "loss": 0.8078, + "mean_token_accuracy": 0.7441996335983276, + "num_tokens": 449684767.0, + "step": 17757 + }, + { + "epoch": 1.9501427630133978, + "grad_norm": 2.1922481060028076, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7245280742645264, + "num_tokens": 449714616.0, + "step": 17758 + }, + { + "epoch": 1.9502525807160114, + "grad_norm": 2.3344593048095703, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7087877988815308, + "num_tokens": 449737451.0, + "step": 17759 + }, + { + "epoch": 1.9503623984186251, + "grad_norm": 2.289546489715576, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.6893976926803589, + "num_tokens": 449763602.0, + "step": 17760 + }, + { + "epoch": 1.9504722161212387, + "grad_norm": 2.4620063304901123, + "learning_rate": 1e-06, + "loss": 0.8153, + "mean_token_accuracy": 0.7358307242393494, + "num_tokens": 449785070.0, + "step": 17761 + }, + { + "epoch": 1.9505820338238524, + "grad_norm": 2.2428295612335205, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 
0.7056037187576294, + "num_tokens": 449809952.0, + "step": 17762 + }, + { + "epoch": 1.9506918515264662, + "grad_norm": 1.929954171180725, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7071108818054199, + "num_tokens": 449842415.0, + "step": 17763 + }, + { + "epoch": 1.9508016692290797, + "grad_norm": 2.1565537452697754, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7195650339126587, + "num_tokens": 449870135.0, + "step": 17764 + }, + { + "epoch": 1.9509114869316933, + "grad_norm": 2.071075201034546, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7093386054039001, + "num_tokens": 449898280.0, + "step": 17765 + }, + { + "epoch": 1.951021304634307, + "grad_norm": 2.2124617099761963, + "learning_rate": 1e-06, + "loss": 0.8099, + "mean_token_accuracy": 0.7438631057739258, + "num_tokens": 449923643.0, + "step": 17766 + }, + { + "epoch": 1.9511311223369208, + "grad_norm": 2.3796794414520264, + "learning_rate": 1e-06, + "loss": 0.7873, + "mean_token_accuracy": 0.7412572503089905, + "num_tokens": 449946617.0, + "step": 17767 + }, + { + "epoch": 1.9512409400395345, + "grad_norm": 2.595762252807617, + "learning_rate": 1e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.7223091125488281, + "num_tokens": 449967682.0, + "step": 17768 + }, + { + "epoch": 1.951350757742148, + "grad_norm": 2.4194741249084473, + "learning_rate": 1e-06, + "loss": 0.8183, + "mean_token_accuracy": 0.7347281575202942, + "num_tokens": 449990566.0, + "step": 17769 + }, + { + "epoch": 1.9514605754447616, + "grad_norm": 2.192646026611328, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7181450128555298, + "num_tokens": 450018244.0, + "step": 17770 + }, + { + "epoch": 1.9515703931473753, + "grad_norm": 2.33148193359375, + "learning_rate": 1e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7233068346977234, + "num_tokens": 450042412.0, + "step": 17771 + }, + { + "epoch": 1.951680210849989, + "grad_norm": 
1.9766855239868164, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7240391969680786, + "num_tokens": 450074812.0, + "step": 17772 + }, + { + "epoch": 1.9517900285526026, + "grad_norm": 2.3961338996887207, + "learning_rate": 1e-06, + "loss": 0.7371, + "mean_token_accuracy": 0.7608786821365356, + "num_tokens": 450095172.0, + "step": 17773 + }, + { + "epoch": 1.9518998462552162, + "grad_norm": 2.0132555961608887, + "learning_rate": 1e-06, + "loss": 0.8438, + "mean_token_accuracy": 0.7428063154220581, + "num_tokens": 450123543.0, + "step": 17774 + }, + { + "epoch": 1.95200966395783, + "grad_norm": 2.2199976444244385, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7289198637008667, + "num_tokens": 450148757.0, + "step": 17775 + }, + { + "epoch": 1.9521194816604437, + "grad_norm": 1.9770902395248413, + "learning_rate": 1e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.688399612903595, + "num_tokens": 450181002.0, + "step": 17776 + }, + { + "epoch": 1.9522292993630574, + "grad_norm": 2.7763938903808594, + "learning_rate": 1e-06, + "loss": 0.7784, + "mean_token_accuracy": 0.7520297765731812, + "num_tokens": 450198458.0, + "step": 17777 + }, + { + "epoch": 1.952339117065671, + "grad_norm": 2.034468650817871, + "learning_rate": 1e-06, + "loss": 0.8195, + "mean_token_accuracy": 0.7403560876846313, + "num_tokens": 450228242.0, + "step": 17778 + }, + { + "epoch": 1.9524489347682845, + "grad_norm": 2.0795323848724365, + "learning_rate": 1e-06, + "loss": 0.8937, + "mean_token_accuracy": 0.7231107950210571, + "num_tokens": 450255811.0, + "step": 17779 + }, + { + "epoch": 1.9525587524708983, + "grad_norm": 2.175220489501953, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7255352735519409, + "num_tokens": 450281725.0, + "step": 17780 + }, + { + "epoch": 1.952668570173512, + "grad_norm": 2.5382893085479736, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7242755889892578, + "num_tokens": 
450302894.0, + "step": 17781 + }, + { + "epoch": 1.9527783878761258, + "grad_norm": 2.2004928588867188, + "learning_rate": 1e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.7378798723220825, + "num_tokens": 450329405.0, + "step": 17782 + }, + { + "epoch": 1.9528882055787393, + "grad_norm": 2.533450126647949, + "learning_rate": 1e-06, + "loss": 0.8051, + "mean_token_accuracy": 0.7465577721595764, + "num_tokens": 450350706.0, + "step": 17783 + }, + { + "epoch": 1.9529980232813529, + "grad_norm": 2.0634419918060303, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7143216729164124, + "num_tokens": 450381923.0, + "step": 17784 + }, + { + "epoch": 1.9531078409839666, + "grad_norm": 2.4958317279815674, + "learning_rate": 1e-06, + "loss": 0.7863, + "mean_token_accuracy": 0.7582042217254639, + "num_tokens": 450403000.0, + "step": 17785 + }, + { + "epoch": 1.9532176586865804, + "grad_norm": 2.2877588272094727, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7072362899780273, + "num_tokens": 450429206.0, + "step": 17786 + }, + { + "epoch": 1.953327476389194, + "grad_norm": 2.2422215938568115, + "learning_rate": 1e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7406681180000305, + "num_tokens": 450455096.0, + "step": 17787 + }, + { + "epoch": 1.9534372940918074, + "grad_norm": 2.0880696773529053, + "learning_rate": 1e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7237813472747803, + "num_tokens": 450485629.0, + "step": 17788 + }, + { + "epoch": 1.9535471117944212, + "grad_norm": 2.0053210258483887, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7109932899475098, + "num_tokens": 450515835.0, + "step": 17789 + }, + { + "epoch": 1.953656929497035, + "grad_norm": 2.0365519523620605, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7037513852119446, + "num_tokens": 450545270.0, + "step": 17790 + }, + { + "epoch": 1.9537667471996487, + "grad_norm": 2.4351727962493896, + 
"learning_rate": 1e-06, + "loss": 0.8315, + "mean_token_accuracy": 0.7413435578346252, + "num_tokens": 450567843.0, + "step": 17791 + }, + { + "epoch": 1.9538765649022622, + "grad_norm": 2.197406053543091, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.6991056203842163, + "num_tokens": 450596428.0, + "step": 17792 + }, + { + "epoch": 1.9539863826048758, + "grad_norm": 2.222038507461548, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7207121849060059, + "num_tokens": 450624045.0, + "step": 17793 + }, + { + "epoch": 1.9540962003074895, + "grad_norm": 2.2883548736572266, + "learning_rate": 1e-06, + "loss": 0.839, + "mean_token_accuracy": 0.7411280870437622, + "num_tokens": 450645482.0, + "step": 17794 + }, + { + "epoch": 1.9542060180101033, + "grad_norm": 2.3419363498687744, + "learning_rate": 1e-06, + "loss": 0.8317, + "mean_token_accuracy": 0.7407869696617126, + "num_tokens": 450669323.0, + "step": 17795 + }, + { + "epoch": 1.954315835712717, + "grad_norm": 2.005167245864868, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7224234342575073, + "num_tokens": 450700949.0, + "step": 17796 + }, + { + "epoch": 1.9544256534153306, + "grad_norm": 2.1621718406677246, + "learning_rate": 1e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.7022838592529297, + "num_tokens": 450729310.0, + "step": 17797 + }, + { + "epoch": 1.9545354711179441, + "grad_norm": 2.446434497833252, + "learning_rate": 1e-06, + "loss": 0.8314, + "mean_token_accuracy": 0.7390260100364685, + "num_tokens": 450751775.0, + "step": 17798 + }, + { + "epoch": 1.9546452888205579, + "grad_norm": 2.2777745723724365, + "learning_rate": 1e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.725609302520752, + "num_tokens": 450776163.0, + "step": 17799 + }, + { + "epoch": 1.9547551065231716, + "grad_norm": 2.090886354446411, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.6899451017379761, + "num_tokens": 450805858.0, + "step": 
17800 + }, + { + "epoch": 1.9548649242257852, + "grad_norm": 2.2701163291931152, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7131757140159607, + "num_tokens": 450832847.0, + "step": 17801 + }, + { + "epoch": 1.9549747419283987, + "grad_norm": 2.3896231651306152, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7022101879119873, + "num_tokens": 450857939.0, + "step": 17802 + }, + { + "epoch": 1.9550845596310125, + "grad_norm": 2.3395602703094482, + "learning_rate": 1e-06, + "loss": 0.8141, + "mean_token_accuracy": 0.7396702766418457, + "num_tokens": 450881571.0, + "step": 17803 + }, + { + "epoch": 1.9551943773336262, + "grad_norm": 2.222947359085083, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7112337350845337, + "num_tokens": 450907992.0, + "step": 17804 + }, + { + "epoch": 1.95530419503624, + "grad_norm": 2.268739938735962, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.716208815574646, + "num_tokens": 450935162.0, + "step": 17805 + }, + { + "epoch": 1.9554140127388535, + "grad_norm": 2.2702057361602783, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.6994288563728333, + "num_tokens": 450961971.0, + "step": 17806 + }, + { + "epoch": 1.955523830441467, + "grad_norm": 2.1074492931365967, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.6982778310775757, + "num_tokens": 450992248.0, + "step": 17807 + }, + { + "epoch": 1.9556336481440808, + "grad_norm": 2.449540376663208, + "learning_rate": 1e-06, + "loss": 0.8664, + "mean_token_accuracy": 0.7279763221740723, + "num_tokens": 451014901.0, + "step": 17808 + }, + { + "epoch": 1.9557434658466946, + "grad_norm": 2.202808141708374, + "learning_rate": 1e-06, + "loss": 0.8682, + "mean_token_accuracy": 0.7344734072685242, + "num_tokens": 451040392.0, + "step": 17809 + }, + { + "epoch": 1.955853283549308, + "grad_norm": 2.1976418495178223, + "learning_rate": 1e-06, + "loss": 0.9465, + 
"mean_token_accuracy": 0.7060518264770508, + "num_tokens": 451067931.0, + "step": 17810 + }, + { + "epoch": 1.9559631012519219, + "grad_norm": 2.635197401046753, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7240683436393738, + "num_tokens": 451086870.0, + "step": 17811 + }, + { + "epoch": 1.9560729189545354, + "grad_norm": 2.107483148574829, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7095800638198853, + "num_tokens": 451115176.0, + "step": 17812 + }, + { + "epoch": 1.9561827366571491, + "grad_norm": 2.2225401401519775, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.6967366933822632, + "num_tokens": 451143236.0, + "step": 17813 + }, + { + "epoch": 1.956292554359763, + "grad_norm": 2.1883652210235596, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7127238512039185, + "num_tokens": 451168624.0, + "step": 17814 + }, + { + "epoch": 1.9564023720623764, + "grad_norm": 2.361966848373413, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7237746715545654, + "num_tokens": 451191351.0, + "step": 17815 + }, + { + "epoch": 1.95651218976499, + "grad_norm": 2.269366502761841, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.6973415613174438, + "num_tokens": 451217133.0, + "step": 17816 + }, + { + "epoch": 1.9566220074676037, + "grad_norm": 2.2013373374938965, + "learning_rate": 1e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7232666611671448, + "num_tokens": 451242460.0, + "step": 17817 + }, + { + "epoch": 1.9567318251702175, + "grad_norm": 2.357621669769287, + "learning_rate": 1e-06, + "loss": 0.8539, + "mean_token_accuracy": 0.7297346591949463, + "num_tokens": 451266175.0, + "step": 17818 + }, + { + "epoch": 1.9568416428728312, + "grad_norm": 2.352935552597046, + "learning_rate": 1e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.7238608598709106, + "num_tokens": 451290275.0, + "step": 17819 + }, + { + "epoch": 1.9569514605754448, 
+ "grad_norm": 2.051954507827759, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7027288675308228, + "num_tokens": 451323914.0, + "step": 17820 + }, + { + "epoch": 1.9570612782780583, + "grad_norm": 2.5738327503204346, + "learning_rate": 1e-06, + "loss": 0.7708, + "mean_token_accuracy": 0.7519458532333374, + "num_tokens": 451343426.0, + "step": 17821 + }, + { + "epoch": 1.957171095980672, + "grad_norm": 2.3406898975372314, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7209470272064209, + "num_tokens": 451365985.0, + "step": 17822 + }, + { + "epoch": 1.9572809136832858, + "grad_norm": 2.5565247535705566, + "learning_rate": 1e-06, + "loss": 0.7617, + "mean_token_accuracy": 0.7490004301071167, + "num_tokens": 451385280.0, + "step": 17823 + }, + { + "epoch": 1.9573907313858994, + "grad_norm": 2.409001588821411, + "learning_rate": 1e-06, + "loss": 0.8104, + "mean_token_accuracy": 0.7441344261169434, + "num_tokens": 451406574.0, + "step": 17824 + }, + { + "epoch": 1.9575005490885131, + "grad_norm": 2.5154151916503906, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7201083898544312, + "num_tokens": 451427482.0, + "step": 17825 + }, + { + "epoch": 1.9576103667911267, + "grad_norm": 2.5112650394439697, + "learning_rate": 1e-06, + "loss": 0.8147, + "mean_token_accuracy": 0.7455381751060486, + "num_tokens": 451448120.0, + "step": 17826 + }, + { + "epoch": 1.9577201844937404, + "grad_norm": 2.1452105045318604, + "learning_rate": 1e-06, + "loss": 0.8347, + "mean_token_accuracy": 0.7348127961158752, + "num_tokens": 451474122.0, + "step": 17827 + }, + { + "epoch": 1.9578300021963542, + "grad_norm": 2.5136983394622803, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.6981387734413147, + "num_tokens": 451496348.0, + "step": 17828 + }, + { + "epoch": 1.9579398198989677, + "grad_norm": 2.0727155208587646, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 
0.7064261436462402, + "num_tokens": 451524825.0, + "step": 17829 + }, + { + "epoch": 1.9580496376015812, + "grad_norm": 2.387009382247925, + "learning_rate": 1e-06, + "loss": 0.8561, + "mean_token_accuracy": 0.7341618537902832, + "num_tokens": 451547414.0, + "step": 17830 + }, + { + "epoch": 1.958159455304195, + "grad_norm": 2.300891637802124, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7045814990997314, + "num_tokens": 451573521.0, + "step": 17831 + }, + { + "epoch": 1.9582692730068088, + "grad_norm": 2.3418080806732178, + "learning_rate": 1e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7310250401496887, + "num_tokens": 451595247.0, + "step": 17832 + }, + { + "epoch": 1.9583790907094225, + "grad_norm": 1.9221152067184448, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.720822811126709, + "num_tokens": 451628742.0, + "step": 17833 + }, + { + "epoch": 1.958488908412036, + "grad_norm": 2.147557497024536, + "learning_rate": 1e-06, + "loss": 0.8221, + "mean_token_accuracy": 0.7386199831962585, + "num_tokens": 451655470.0, + "step": 17834 + }, + { + "epoch": 1.9585987261146496, + "grad_norm": 2.118959903717041, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7166704535484314, + "num_tokens": 451682733.0, + "step": 17835 + }, + { + "epoch": 1.9587085438172633, + "grad_norm": 2.334545612335205, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.725611686706543, + "num_tokens": 451707220.0, + "step": 17836 + }, + { + "epoch": 1.958818361519877, + "grad_norm": 2.0502240657806396, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7117400169372559, + "num_tokens": 451737177.0, + "step": 17837 + }, + { + "epoch": 1.9589281792224906, + "grad_norm": 2.0452139377593994, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7127465009689331, + "num_tokens": 451766736.0, + "step": 17838 + }, + { + "epoch": 1.9590379969251042, + "grad_norm": 
2.190217971801758, + "learning_rate": 1e-06, + "loss": 0.8546, + "mean_token_accuracy": 0.727271318435669, + "num_tokens": 451792591.0, + "step": 17839 + }, + { + "epoch": 1.959147814627718, + "grad_norm": 2.110799551010132, + "learning_rate": 1e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.7276037335395813, + "num_tokens": 451818127.0, + "step": 17840 + }, + { + "epoch": 1.9592576323303317, + "grad_norm": 2.0600152015686035, + "learning_rate": 1e-06, + "loss": 0.8273, + "mean_token_accuracy": 0.7424077987670898, + "num_tokens": 451846100.0, + "step": 17841 + }, + { + "epoch": 1.9593674500329454, + "grad_norm": 2.1561455726623535, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7139506340026855, + "num_tokens": 451874882.0, + "step": 17842 + }, + { + "epoch": 1.959477267735559, + "grad_norm": 2.280134677886963, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7164210677146912, + "num_tokens": 451900154.0, + "step": 17843 + }, + { + "epoch": 1.9595870854381725, + "grad_norm": 2.2478277683258057, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7106614708900452, + "num_tokens": 451926547.0, + "step": 17844 + }, + { + "epoch": 1.9596969031407863, + "grad_norm": 2.0730795860290527, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7088372111320496, + "num_tokens": 451954625.0, + "step": 17845 + }, + { + "epoch": 1.9598067208434, + "grad_norm": 2.3371400833129883, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7070462703704834, + "num_tokens": 451978205.0, + "step": 17846 + }, + { + "epoch": 1.9599165385460138, + "grad_norm": 2.625319719314575, + "learning_rate": 1e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.725736141204834, + "num_tokens": 451997412.0, + "step": 17847 + }, + { + "epoch": 1.9600263562486273, + "grad_norm": 2.3942787647247314, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7125214338302612, + "num_tokens": 
452020857.0, + "step": 17848 + }, + { + "epoch": 1.9601361739512408, + "grad_norm": 2.3002848625183105, + "learning_rate": 1e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.7289303541183472, + "num_tokens": 452044643.0, + "step": 17849 + }, + { + "epoch": 1.9602459916538546, + "grad_norm": 2.552670955657959, + "learning_rate": 1e-06, + "loss": 0.8473, + "mean_token_accuracy": 0.7435996532440186, + "num_tokens": 452065004.0, + "step": 17850 + }, + { + "epoch": 1.9603558093564684, + "grad_norm": 2.3381094932556152, + "learning_rate": 1e-06, + "loss": 0.8521, + "mean_token_accuracy": 0.7260083556175232, + "num_tokens": 452088979.0, + "step": 17851 + }, + { + "epoch": 1.960465627059082, + "grad_norm": 2.046287775039673, + "learning_rate": 1e-06, + "loss": 1.0378, + "mean_token_accuracy": 0.6860103011131287, + "num_tokens": 452119504.0, + "step": 17852 + }, + { + "epoch": 1.9605754447616954, + "grad_norm": 2.4116415977478027, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7161069512367249, + "num_tokens": 452141812.0, + "step": 17853 + }, + { + "epoch": 1.9606852624643092, + "grad_norm": 2.3817965984344482, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7185739874839783, + "num_tokens": 452166270.0, + "step": 17854 + }, + { + "epoch": 1.960795080166923, + "grad_norm": 2.2917165756225586, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7139952778816223, + "num_tokens": 452191477.0, + "step": 17855 + }, + { + "epoch": 1.9609048978695367, + "grad_norm": 2.23429536819458, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.718187153339386, + "num_tokens": 452216738.0, + "step": 17856 + }, + { + "epoch": 1.9610147155721502, + "grad_norm": 2.036972761154175, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7036099433898926, + "num_tokens": 452247821.0, + "step": 17857 + }, + { + "epoch": 1.9611245332747638, + "grad_norm": 2.197185754776001, + "learning_rate": 
1e-06, + "loss": 0.8323, + "mean_token_accuracy": 0.740580677986145, + "num_tokens": 452274150.0, + "step": 17858 + }, + { + "epoch": 1.9612343509773775, + "grad_norm": 2.3753821849823, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7103821635246277, + "num_tokens": 452300168.0, + "step": 17859 + }, + { + "epoch": 1.9613441686799913, + "grad_norm": 2.583714485168457, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7211675643920898, + "num_tokens": 452322846.0, + "step": 17860 + }, + { + "epoch": 1.961453986382605, + "grad_norm": 2.2004904747009277, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7226150631904602, + "num_tokens": 452349264.0, + "step": 17861 + }, + { + "epoch": 1.9615638040852186, + "grad_norm": 2.105825185775757, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7123610973358154, + "num_tokens": 452377581.0, + "step": 17862 + }, + { + "epoch": 1.9616736217878321, + "grad_norm": 2.2547061443328857, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.717936635017395, + "num_tokens": 452403779.0, + "step": 17863 + }, + { + "epoch": 1.9617834394904459, + "grad_norm": 2.1537392139434814, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7033154964447021, + "num_tokens": 452432876.0, + "step": 17864 + }, + { + "epoch": 1.9618932571930596, + "grad_norm": 2.4187307357788086, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.706641674041748, + "num_tokens": 452456722.0, + "step": 17865 + }, + { + "epoch": 1.9620030748956732, + "grad_norm": 2.2611520290374756, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.703864574432373, + "num_tokens": 452483180.0, + "step": 17866 + }, + { + "epoch": 1.9621128925982867, + "grad_norm": 2.096588611602783, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7218541502952576, + "num_tokens": 452509441.0, + "step": 17867 + }, + { + 
"epoch": 1.9622227103009005, + "grad_norm": 2.141768217086792, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7010021209716797, + "num_tokens": 452536450.0, + "step": 17868 + }, + { + "epoch": 1.9623325280035142, + "grad_norm": 2.550440549850464, + "learning_rate": 1e-06, + "loss": 0.7856, + "mean_token_accuracy": 0.7448784112930298, + "num_tokens": 452556407.0, + "step": 17869 + }, + { + "epoch": 1.962442345706128, + "grad_norm": 1.9014285802841187, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7173570990562439, + "num_tokens": 452589412.0, + "step": 17870 + }, + { + "epoch": 1.9625521634087415, + "grad_norm": 2.342682123184204, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7229551076889038, + "num_tokens": 452612300.0, + "step": 17871 + }, + { + "epoch": 1.962661981111355, + "grad_norm": 2.1345458030700684, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7290130853652954, + "num_tokens": 452639317.0, + "step": 17872 + }, + { + "epoch": 1.9627717988139688, + "grad_norm": 2.1342074871063232, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7269961833953857, + "num_tokens": 452668486.0, + "step": 17873 + }, + { + "epoch": 1.9628816165165826, + "grad_norm": 2.2564504146575928, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7071356773376465, + "num_tokens": 452694987.0, + "step": 17874 + }, + { + "epoch": 1.962991434219196, + "grad_norm": 2.3274683952331543, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7154484987258911, + "num_tokens": 452720767.0, + "step": 17875 + }, + { + "epoch": 1.9631012519218098, + "grad_norm": 2.9277145862579346, + "learning_rate": 1e-06, + "loss": 0.866, + "mean_token_accuracy": 0.7260609269142151, + "num_tokens": 452738601.0, + "step": 17876 + }, + { + "epoch": 1.9632110696244234, + "grad_norm": 2.580430746078491, + "learning_rate": 1e-06, + "loss": 0.7872, + 
"mean_token_accuracy": 0.7517579793930054, + "num_tokens": 452759474.0, + "step": 17877 + }, + { + "epoch": 1.9633208873270371, + "grad_norm": 2.450141191482544, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7153635025024414, + "num_tokens": 452784548.0, + "step": 17878 + }, + { + "epoch": 1.963430705029651, + "grad_norm": 2.661510467529297, + "learning_rate": 1e-06, + "loss": 0.8448, + "mean_token_accuracy": 0.7303049564361572, + "num_tokens": 452802979.0, + "step": 17879 + }, + { + "epoch": 1.9635405227322644, + "grad_norm": 2.5274276733398438, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7217700481414795, + "num_tokens": 452824585.0, + "step": 17880 + }, + { + "epoch": 1.963650340434878, + "grad_norm": 2.344235420227051, + "learning_rate": 1e-06, + "loss": 0.844, + "mean_token_accuracy": 0.7362345457077026, + "num_tokens": 452848681.0, + "step": 17881 + }, + { + "epoch": 1.9637601581374917, + "grad_norm": 2.13120698928833, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7224579453468323, + "num_tokens": 452877628.0, + "step": 17882 + }, + { + "epoch": 1.9638699758401055, + "grad_norm": 2.0925090312957764, + "learning_rate": 1e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.7264186143875122, + "num_tokens": 452905351.0, + "step": 17883 + }, + { + "epoch": 1.9639797935427192, + "grad_norm": 2.1797492504119873, + "learning_rate": 1e-06, + "loss": 0.8361, + "mean_token_accuracy": 0.7324140071868896, + "num_tokens": 452930264.0, + "step": 17884 + }, + { + "epoch": 1.9640896112453328, + "grad_norm": 2.212101936340332, + "learning_rate": 1e-06, + "loss": 1.0243, + "mean_token_accuracy": 0.6882737874984741, + "num_tokens": 452957565.0, + "step": 17885 + }, + { + "epoch": 1.9641994289479463, + "grad_norm": 2.4694862365722656, + "learning_rate": 1e-06, + "loss": 0.8626, + "mean_token_accuracy": 0.7304191589355469, + "num_tokens": 452979110.0, + "step": 17886 + }, + { + "epoch": 1.96430924665056, + 
"grad_norm": 2.5918734073638916, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.708707869052887, + "num_tokens": 453000080.0, + "step": 17887 + }, + { + "epoch": 1.9644190643531738, + "grad_norm": 2.500321865081787, + "learning_rate": 1e-06, + "loss": 0.7899, + "mean_token_accuracy": 0.7498593330383301, + "num_tokens": 453020185.0, + "step": 17888 + }, + { + "epoch": 1.9645288820557874, + "grad_norm": 2.301917314529419, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7142157554626465, + "num_tokens": 453043871.0, + "step": 17889 + }, + { + "epoch": 1.964638699758401, + "grad_norm": 2.4752533435821533, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7270351052284241, + "num_tokens": 453064576.0, + "step": 17890 + }, + { + "epoch": 1.9647485174610146, + "grad_norm": 2.6311302185058594, + "learning_rate": 1e-06, + "loss": 0.8565, + "mean_token_accuracy": 0.7250185012817383, + "num_tokens": 453086077.0, + "step": 17891 + }, + { + "epoch": 1.9648583351636284, + "grad_norm": 2.363097906112671, + "learning_rate": 1e-06, + "loss": 0.8703, + "mean_token_accuracy": 0.7222607731819153, + "num_tokens": 453109293.0, + "step": 17892 + }, + { + "epoch": 1.9649681528662422, + "grad_norm": 2.3489153385162354, + "learning_rate": 1e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7083532214164734, + "num_tokens": 453133423.0, + "step": 17893 + }, + { + "epoch": 1.9650779705688557, + "grad_norm": 2.263160467147827, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7119977474212646, + "num_tokens": 453157237.0, + "step": 17894 + }, + { + "epoch": 1.9651877882714692, + "grad_norm": 2.1921517848968506, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7168402075767517, + "num_tokens": 453182249.0, + "step": 17895 + }, + { + "epoch": 1.965297605974083, + "grad_norm": 2.458760976791382, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7270957231521606, + 
"num_tokens": 453205071.0, + "step": 17896 + }, + { + "epoch": 1.9654074236766967, + "grad_norm": 2.362340211868286, + "learning_rate": 1e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.7361457347869873, + "num_tokens": 453229512.0, + "step": 17897 + }, + { + "epoch": 1.9655172413793105, + "grad_norm": 2.173379898071289, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7218590974807739, + "num_tokens": 453256742.0, + "step": 17898 + }, + { + "epoch": 1.965627059081924, + "grad_norm": 2.4535346031188965, + "learning_rate": 1e-06, + "loss": 0.7922, + "mean_token_accuracy": 0.7462835907936096, + "num_tokens": 453278295.0, + "step": 17899 + }, + { + "epoch": 1.9657368767845376, + "grad_norm": 2.4747750759124756, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.706536591053009, + "num_tokens": 453300457.0, + "step": 17900 + }, + { + "epoch": 1.9658466944871513, + "grad_norm": 2.2241907119750977, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7249635457992554, + "num_tokens": 453327569.0, + "step": 17901 + }, + { + "epoch": 1.965956512189765, + "grad_norm": 2.2187907695770264, + "learning_rate": 1e-06, + "loss": 0.8512, + "mean_token_accuracy": 0.7342639565467834, + "num_tokens": 453353137.0, + "step": 17902 + }, + { + "epoch": 1.9660663298923786, + "grad_norm": 2.349348783493042, + "learning_rate": 1e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.724733829498291, + "num_tokens": 453377525.0, + "step": 17903 + }, + { + "epoch": 1.9661761475949922, + "grad_norm": 2.5947742462158203, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7141680717468262, + "num_tokens": 453398926.0, + "step": 17904 + }, + { + "epoch": 1.966285965297606, + "grad_norm": 1.8641713857650757, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.6897218227386475, + "num_tokens": 453436189.0, + "step": 17905 + }, + { + "epoch": 1.9663957830002197, + "grad_norm": 2.4075255393981934, + 
"learning_rate": 1e-06, + "loss": 0.7535, + "mean_token_accuracy": 0.7564008235931396, + "num_tokens": 453458087.0, + "step": 17906 + }, + { + "epoch": 1.9665056007028334, + "grad_norm": 2.3041388988494873, + "learning_rate": 1e-06, + "loss": 0.8294, + "mean_token_accuracy": 0.7404241561889648, + "num_tokens": 453481121.0, + "step": 17907 + }, + { + "epoch": 1.966615418405447, + "grad_norm": 2.2731664180755615, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7206249237060547, + "num_tokens": 453505836.0, + "step": 17908 + }, + { + "epoch": 1.9667252361080605, + "grad_norm": 2.0752711296081543, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7148454189300537, + "num_tokens": 453533642.0, + "step": 17909 + }, + { + "epoch": 1.9668350538106742, + "grad_norm": 1.944698452949524, + "learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.6865315437316895, + "num_tokens": 453568733.0, + "step": 17910 + }, + { + "epoch": 1.966944871513288, + "grad_norm": 2.1929144859313965, + "learning_rate": 1e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.7439820766448975, + "num_tokens": 453594547.0, + "step": 17911 + }, + { + "epoch": 1.9670546892159018, + "grad_norm": 2.257140874862671, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7146803140640259, + "num_tokens": 453620716.0, + "step": 17912 + }, + { + "epoch": 1.9671645069185153, + "grad_norm": 2.0376741886138916, + "learning_rate": 1e-06, + "loss": 0.7249, + "mean_token_accuracy": 0.7640067338943481, + "num_tokens": 453645297.0, + "step": 17913 + }, + { + "epoch": 1.9672743246211288, + "grad_norm": 2.2608156204223633, + "learning_rate": 1e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.734897792339325, + "num_tokens": 453670513.0, + "step": 17914 + }, + { + "epoch": 1.9673841423237426, + "grad_norm": 2.2962334156036377, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.705102801322937, + "num_tokens": 453695091.0, + "step": 
17915 + }, + { + "epoch": 1.9674939600263563, + "grad_norm": 2.189530611038208, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7214696407318115, + "num_tokens": 453721018.0, + "step": 17916 + }, + { + "epoch": 1.9676037777289699, + "grad_norm": 2.3150718212127686, + "learning_rate": 1e-06, + "loss": 0.8528, + "mean_token_accuracy": 0.7286646366119385, + "num_tokens": 453745617.0, + "step": 17917 + }, + { + "epoch": 1.9677135954315834, + "grad_norm": 2.119868516921997, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7077701091766357, + "num_tokens": 453773635.0, + "step": 17918 + }, + { + "epoch": 1.9678234131341972, + "grad_norm": 2.256434440612793, + "learning_rate": 1e-06, + "loss": 0.8424, + "mean_token_accuracy": 0.7372106909751892, + "num_tokens": 453796730.0, + "step": 17919 + }, + { + "epoch": 1.967933230836811, + "grad_norm": 2.4958620071411133, + "learning_rate": 1e-06, + "loss": 0.776, + "mean_token_accuracy": 0.7499685287475586, + "num_tokens": 453817422.0, + "step": 17920 + }, + { + "epoch": 1.9680430485394247, + "grad_norm": 2.0852510929107666, + "learning_rate": 1e-06, + "loss": 0.7377, + "mean_token_accuracy": 0.7638070583343506, + "num_tokens": 453842792.0, + "step": 17921 + }, + { + "epoch": 1.9681528662420382, + "grad_norm": 2.5040292739868164, + "learning_rate": 1e-06, + "loss": 0.8381, + "mean_token_accuracy": 0.7345090508460999, + "num_tokens": 453863270.0, + "step": 17922 + }, + { + "epoch": 1.9682626839446518, + "grad_norm": 2.005481719970703, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7171676158905029, + "num_tokens": 453893507.0, + "step": 17923 + }, + { + "epoch": 1.9683725016472655, + "grad_norm": 2.2344961166381836, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.726317286491394, + "num_tokens": 453921584.0, + "step": 17924 + }, + { + "epoch": 1.9684823193498793, + "grad_norm": 2.2792885303497314, + "learning_rate": 1e-06, + "loss": 0.8587, 
+ "mean_token_accuracy": 0.7400631904602051, + "num_tokens": 453944205.0, + "step": 17925 + }, + { + "epoch": 1.9685921370524928, + "grad_norm": 2.3102970123291016, + "learning_rate": 1e-06, + "loss": 0.8009, + "mean_token_accuracy": 0.7447133660316467, + "num_tokens": 453967519.0, + "step": 17926 + }, + { + "epoch": 1.9687019547551066, + "grad_norm": 2.2010059356689453, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.6935625076293945, + "num_tokens": 453997258.0, + "step": 17927 + }, + { + "epoch": 1.96881177245772, + "grad_norm": 2.2723848819732666, + "learning_rate": 1e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7253386378288269, + "num_tokens": 454022213.0, + "step": 17928 + }, + { + "epoch": 1.9689215901603339, + "grad_norm": 1.9731718301773071, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7044925689697266, + "num_tokens": 454054268.0, + "step": 17929 + }, + { + "epoch": 1.9690314078629476, + "grad_norm": 2.5044684410095215, + "learning_rate": 1e-06, + "loss": 0.747, + "mean_token_accuracy": 0.7627931237220764, + "num_tokens": 454074627.0, + "step": 17930 + }, + { + "epoch": 1.9691412255655611, + "grad_norm": 2.26499342918396, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7209035754203796, + "num_tokens": 454099769.0, + "step": 17931 + }, + { + "epoch": 1.9692510432681747, + "grad_norm": 2.271219253540039, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.702713668346405, + "num_tokens": 454125805.0, + "step": 17932 + }, + { + "epoch": 1.9693608609707884, + "grad_norm": 2.756164312362671, + "learning_rate": 1e-06, + "loss": 0.8144, + "mean_token_accuracy": 0.7444149255752563, + "num_tokens": 454145984.0, + "step": 17933 + }, + { + "epoch": 1.9694706786734022, + "grad_norm": 2.0028767585754395, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7075265645980835, + "num_tokens": 454176081.0, + "step": 17934 + }, + { + "epoch": 
1.969580496376016, + "grad_norm": 2.2858726978302, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7100985050201416, + "num_tokens": 454201348.0, + "step": 17935 + }, + { + "epoch": 1.9696903140786295, + "grad_norm": 2.0933985710144043, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7203128337860107, + "num_tokens": 454230145.0, + "step": 17936 + }, + { + "epoch": 1.969800131781243, + "grad_norm": 2.0993456840515137, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7263524532318115, + "num_tokens": 454258251.0, + "step": 17937 + }, + { + "epoch": 1.9699099494838568, + "grad_norm": 2.348019599914551, + "learning_rate": 1e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7228027582168579, + "num_tokens": 454281876.0, + "step": 17938 + }, + { + "epoch": 1.9700197671864705, + "grad_norm": 2.6954405307769775, + "learning_rate": 1e-06, + "loss": 0.8367, + "mean_token_accuracy": 0.7370816469192505, + "num_tokens": 454300582.0, + "step": 17939 + }, + { + "epoch": 1.970129584889084, + "grad_norm": 2.3676323890686035, + "learning_rate": 1e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7201468348503113, + "num_tokens": 454323806.0, + "step": 17940 + }, + { + "epoch": 1.9702394025916978, + "grad_norm": 2.136744260787964, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7271207571029663, + "num_tokens": 454350836.0, + "step": 17941 + }, + { + "epoch": 1.9703492202943114, + "grad_norm": 2.3186089992523193, + "learning_rate": 1e-06, + "loss": 0.796, + "mean_token_accuracy": 0.7512203454971313, + "num_tokens": 454372539.0, + "step": 17942 + }, + { + "epoch": 1.9704590379969251, + "grad_norm": 2.3348441123962402, + "learning_rate": 1e-06, + "loss": 0.8078, + "mean_token_accuracy": 0.7427046895027161, + "num_tokens": 454395231.0, + "step": 17943 + }, + { + "epoch": 1.9705688556995389, + "grad_norm": 2.33368182182312, + "learning_rate": 1e-06, + "loss": 0.8771, + "mean_token_accuracy": 
0.7263003587722778, + "num_tokens": 454419828.0, + "step": 17944 + }, + { + "epoch": 1.9706786734021524, + "grad_norm": 2.1271495819091797, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7241972088813782, + "num_tokens": 454448923.0, + "step": 17945 + }, + { + "epoch": 1.970788491104766, + "grad_norm": 2.4598581790924072, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7166025638580322, + "num_tokens": 454471773.0, + "step": 17946 + }, + { + "epoch": 1.9708983088073797, + "grad_norm": 2.1298668384552, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7080032825469971, + "num_tokens": 454500291.0, + "step": 17947 + }, + { + "epoch": 1.9710081265099935, + "grad_norm": 2.2287895679473877, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7198607921600342, + "num_tokens": 454526024.0, + "step": 17948 + }, + { + "epoch": 1.9711179442126072, + "grad_norm": 1.9644098281860352, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7053219079971313, + "num_tokens": 454557723.0, + "step": 17949 + }, + { + "epoch": 1.9712277619152208, + "grad_norm": 2.148765802383423, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7102546095848083, + "num_tokens": 454584404.0, + "step": 17950 + }, + { + "epoch": 1.9713375796178343, + "grad_norm": 2.4217989444732666, + "learning_rate": 1e-06, + "loss": 0.8429, + "mean_token_accuracy": 0.7396150827407837, + "num_tokens": 454606004.0, + "step": 17951 + }, + { + "epoch": 1.971447397320448, + "grad_norm": 2.221637010574341, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7202267646789551, + "num_tokens": 454631132.0, + "step": 17952 + }, + { + "epoch": 1.9715572150230618, + "grad_norm": 2.1477935314178467, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7178798913955688, + "num_tokens": 454658610.0, + "step": 17953 + }, + { + "epoch": 1.9716670327256753, + "grad_norm": 
2.009791374206543, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7217437624931335, + "num_tokens": 454688349.0, + "step": 17954 + }, + { + "epoch": 1.9717768504282889, + "grad_norm": 2.2476449012756348, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7233781814575195, + "num_tokens": 454714047.0, + "step": 17955 + }, + { + "epoch": 1.9718866681309026, + "grad_norm": 2.1297831535339355, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7165829539299011, + "num_tokens": 454743438.0, + "step": 17956 + }, + { + "epoch": 1.9719964858335164, + "grad_norm": 2.1399824619293213, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7103092670440674, + "num_tokens": 454768920.0, + "step": 17957 + }, + { + "epoch": 1.9721063035361301, + "grad_norm": 2.1596741676330566, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7053171396255493, + "num_tokens": 454795790.0, + "step": 17958 + }, + { + "epoch": 1.9722161212387437, + "grad_norm": 2.2979462146759033, + "learning_rate": 1e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.7184615731239319, + "num_tokens": 454821354.0, + "step": 17959 + }, + { + "epoch": 1.9723259389413572, + "grad_norm": 2.4168713092803955, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.6955548524856567, + "num_tokens": 454846355.0, + "step": 17960 + }, + { + "epoch": 1.972435756643971, + "grad_norm": 2.2002830505371094, + "learning_rate": 1e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.7244458794593811, + "num_tokens": 454873335.0, + "step": 17961 + }, + { + "epoch": 1.9725455743465847, + "grad_norm": 2.124613046646118, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7119718790054321, + "num_tokens": 454902097.0, + "step": 17962 + }, + { + "epoch": 1.9726553920491985, + "grad_norm": 2.1366991996765137, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7277083396911621, + 
"num_tokens": 454928873.0, + "step": 17963 + }, + { + "epoch": 1.972765209751812, + "grad_norm": 2.194275379180908, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7133792638778687, + "num_tokens": 454953253.0, + "step": 17964 + }, + { + "epoch": 1.9728750274544256, + "grad_norm": 2.210188865661621, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7133638262748718, + "num_tokens": 454979708.0, + "step": 17965 + }, + { + "epoch": 1.9729848451570393, + "grad_norm": 2.0859062671661377, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.707379937171936, + "num_tokens": 455007874.0, + "step": 17966 + }, + { + "epoch": 1.973094662859653, + "grad_norm": 2.4701380729675293, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7155542373657227, + "num_tokens": 455029861.0, + "step": 17967 + }, + { + "epoch": 1.9732044805622666, + "grad_norm": 1.9466259479522705, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.720431923866272, + "num_tokens": 455061114.0, + "step": 17968 + }, + { + "epoch": 1.9733142982648801, + "grad_norm": 2.316906213760376, + "learning_rate": 1e-06, + "loss": 0.849, + "mean_token_accuracy": 0.7297840714454651, + "num_tokens": 455085253.0, + "step": 17969 + }, + { + "epoch": 1.973424115967494, + "grad_norm": 2.530233144760132, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7181674242019653, + "num_tokens": 455106408.0, + "step": 17970 + }, + { + "epoch": 1.9735339336701077, + "grad_norm": 2.1898624897003174, + "learning_rate": 1e-06, + "loss": 0.8359, + "mean_token_accuracy": 0.740263044834137, + "num_tokens": 455133500.0, + "step": 17971 + }, + { + "epoch": 1.9736437513727214, + "grad_norm": 2.3365795612335205, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7213049530982971, + "num_tokens": 455157733.0, + "step": 17972 + }, + { + "epoch": 1.973753569075335, + "grad_norm": 2.470783233642578, + 
"learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7220531702041626, + "num_tokens": 455178231.0, + "step": 17973 + }, + { + "epoch": 1.9738633867779485, + "grad_norm": 2.0217385292053223, + "learning_rate": 1e-06, + "loss": 0.8341, + "mean_token_accuracy": 0.7437678575515747, + "num_tokens": 455205332.0, + "step": 17974 + }, + { + "epoch": 1.9739732044805622, + "grad_norm": 2.3655993938446045, + "learning_rate": 1e-06, + "loss": 0.8052, + "mean_token_accuracy": 0.7461638450622559, + "num_tokens": 455228423.0, + "step": 17975 + }, + { + "epoch": 1.974083022183176, + "grad_norm": 2.217740297317505, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.719275712966919, + "num_tokens": 455253593.0, + "step": 17976 + }, + { + "epoch": 1.9741928398857898, + "grad_norm": 2.506603956222534, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7102742195129395, + "num_tokens": 455275456.0, + "step": 17977 + }, + { + "epoch": 1.9743026575884033, + "grad_norm": 2.4194724559783936, + "learning_rate": 1e-06, + "loss": 0.8247, + "mean_token_accuracy": 0.7392590045928955, + "num_tokens": 455297686.0, + "step": 17978 + }, + { + "epoch": 1.9744124752910168, + "grad_norm": 2.4385199546813965, + "learning_rate": 1e-06, + "loss": 0.7579, + "mean_token_accuracy": 0.7562034130096436, + "num_tokens": 455318392.0, + "step": 17979 + }, + { + "epoch": 1.9745222929936306, + "grad_norm": 2.20039963722229, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7129358649253845, + "num_tokens": 455345001.0, + "step": 17980 + }, + { + "epoch": 1.9746321106962443, + "grad_norm": 2.2402679920196533, + "learning_rate": 1e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7100754976272583, + "num_tokens": 455370881.0, + "step": 17981 + }, + { + "epoch": 1.9747419283988579, + "grad_norm": 2.438163995742798, + "learning_rate": 1e-06, + "loss": 0.8375, + "mean_token_accuracy": 0.7367284297943115, + "num_tokens": 455392874.0, + "step": 
17982 + }, + { + "epoch": 1.9748517461014714, + "grad_norm": 2.355686664581299, + "learning_rate": 1e-06, + "loss": 0.8111, + "mean_token_accuracy": 0.7427747249603271, + "num_tokens": 455414888.0, + "step": 17983 + }, + { + "epoch": 1.9749615638040852, + "grad_norm": 2.2094058990478516, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.6936571598052979, + "num_tokens": 455441132.0, + "step": 17984 + }, + { + "epoch": 1.975071381506699, + "grad_norm": 2.226142168045044, + "learning_rate": 1e-06, + "loss": 0.8581, + "mean_token_accuracy": 0.7263951897621155, + "num_tokens": 455467540.0, + "step": 17985 + }, + { + "epoch": 1.9751811992093127, + "grad_norm": 2.3365402221679688, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7177013158798218, + "num_tokens": 455491883.0, + "step": 17986 + }, + { + "epoch": 1.9752910169119262, + "grad_norm": 2.030694007873535, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.6985454559326172, + "num_tokens": 455520487.0, + "step": 17987 + }, + { + "epoch": 1.9754008346145397, + "grad_norm": 2.1076738834381104, + "learning_rate": 1e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.7243398427963257, + "num_tokens": 455546073.0, + "step": 17988 + }, + { + "epoch": 1.9755106523171535, + "grad_norm": 2.397031784057617, + "learning_rate": 1e-06, + "loss": 0.8127, + "mean_token_accuracy": 0.7447298765182495, + "num_tokens": 455566682.0, + "step": 17989 + }, + { + "epoch": 1.9756204700197673, + "grad_norm": 2.5590882301330566, + "learning_rate": 1e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.7345272302627563, + "num_tokens": 455586776.0, + "step": 17990 + }, + { + "epoch": 1.9757302877223808, + "grad_norm": 2.4957103729248047, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7189605236053467, + "num_tokens": 455608304.0, + "step": 17991 + }, + { + "epoch": 1.9758401054249946, + "grad_norm": 2.4482638835906982, + "learning_rate": 1e-06, + "loss": 
0.9016, + "mean_token_accuracy": 0.7276355028152466, + "num_tokens": 455630245.0, + "step": 17992 + }, + { + "epoch": 1.975949923127608, + "grad_norm": 3.1137914657592773, + "learning_rate": 1e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7371528744697571, + "num_tokens": 455650335.0, + "step": 17993 + }, + { + "epoch": 1.9760597408302218, + "grad_norm": 2.273099899291992, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7059149146080017, + "num_tokens": 455676777.0, + "step": 17994 + }, + { + "epoch": 1.9761695585328356, + "grad_norm": 2.4210219383239746, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7124137878417969, + "num_tokens": 455699705.0, + "step": 17995 + }, + { + "epoch": 1.9762793762354491, + "grad_norm": 2.0872645378112793, + "learning_rate": 1e-06, + "loss": 0.8466, + "mean_token_accuracy": 0.7344719767570496, + "num_tokens": 455727731.0, + "step": 17996 + }, + { + "epoch": 1.9763891939380627, + "grad_norm": 2.1446874141693115, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7207821011543274, + "num_tokens": 455753730.0, + "step": 17997 + }, + { + "epoch": 1.9764990116406764, + "grad_norm": 2.114551544189453, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7045724391937256, + "num_tokens": 455782426.0, + "step": 17998 + }, + { + "epoch": 1.9766088293432902, + "grad_norm": 2.134119987487793, + "learning_rate": 1e-06, + "loss": 0.8347, + "mean_token_accuracy": 0.7324244976043701, + "num_tokens": 455808677.0, + "step": 17999 + }, + { + "epoch": 1.976718647045904, + "grad_norm": 2.5575451850891113, + "learning_rate": 1e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7276632189750671, + "num_tokens": 455829069.0, + "step": 18000 + }, + { + "epoch": 1.9768284647485175, + "grad_norm": 2.406097650527954, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7058330178260803, + "num_tokens": 455853189.0, + "step": 18001 + }, + { + "epoch": 
1.976938282451131, + "grad_norm": 2.257317304611206, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7310992479324341, + "num_tokens": 455876775.0, + "step": 18002 + }, + { + "epoch": 1.9770481001537448, + "grad_norm": 2.281942844390869, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7135606408119202, + "num_tokens": 455899562.0, + "step": 18003 + }, + { + "epoch": 1.9771579178563585, + "grad_norm": 2.2565107345581055, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7025025486946106, + "num_tokens": 455926347.0, + "step": 18004 + }, + { + "epoch": 1.977267735558972, + "grad_norm": 2.2688324451446533, + "learning_rate": 1e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.733308732509613, + "num_tokens": 455952898.0, + "step": 18005 + }, + { + "epoch": 1.9773775532615858, + "grad_norm": 2.4964888095855713, + "learning_rate": 1e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7314478158950806, + "num_tokens": 455975364.0, + "step": 18006 + }, + { + "epoch": 1.9774873709641994, + "grad_norm": 2.3415188789367676, + "learning_rate": 1e-06, + "loss": 0.838, + "mean_token_accuracy": 0.7442711591720581, + "num_tokens": 455999583.0, + "step": 18007 + }, + { + "epoch": 1.977597188666813, + "grad_norm": 2.0801780223846436, + "learning_rate": 1e-06, + "loss": 1.0621, + "mean_token_accuracy": 0.6879693269729614, + "num_tokens": 456029399.0, + "step": 18008 + }, + { + "epoch": 1.9777070063694269, + "grad_norm": 2.152880907058716, + "learning_rate": 1e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.6889064908027649, + "num_tokens": 456059297.0, + "step": 18009 + }, + { + "epoch": 1.9778168240720404, + "grad_norm": 2.47400164604187, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7208104133605957, + "num_tokens": 456082110.0, + "step": 18010 + }, + { + "epoch": 1.977926641774654, + "grad_norm": 2.216425895690918, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 
0.7137366533279419, + "num_tokens": 456107749.0, + "step": 18011 + }, + { + "epoch": 1.9780364594772677, + "grad_norm": 2.3981330394744873, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7203843593597412, + "num_tokens": 456130716.0, + "step": 18012 + }, + { + "epoch": 1.9781462771798815, + "grad_norm": 2.246852159500122, + "learning_rate": 1e-06, + "loss": 0.8332, + "mean_token_accuracy": 0.7308196425437927, + "num_tokens": 456153787.0, + "step": 18013 + }, + { + "epoch": 1.9782560948824952, + "grad_norm": 2.3566389083862305, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.711819052696228, + "num_tokens": 456175333.0, + "step": 18014 + }, + { + "epoch": 1.9783659125851087, + "grad_norm": 2.2974693775177, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7104562520980835, + "num_tokens": 456201367.0, + "step": 18015 + }, + { + "epoch": 1.9784757302877223, + "grad_norm": 2.6267518997192383, + "learning_rate": 1e-06, + "loss": 0.872, + "mean_token_accuracy": 0.7213557958602905, + "num_tokens": 456220347.0, + "step": 18016 + }, + { + "epoch": 1.978585547990336, + "grad_norm": 2.2890968322753906, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7069330811500549, + "num_tokens": 456246859.0, + "step": 18017 + }, + { + "epoch": 1.9786953656929498, + "grad_norm": 2.0252974033355713, + "learning_rate": 1e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.7264057397842407, + "num_tokens": 456274828.0, + "step": 18018 + }, + { + "epoch": 1.9788051833955633, + "grad_norm": 2.137141227722168, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7079293727874756, + "num_tokens": 456302870.0, + "step": 18019 + }, + { + "epoch": 1.9789150010981769, + "grad_norm": 2.4232337474823, + "learning_rate": 1e-06, + "loss": 0.7927, + "mean_token_accuracy": 0.7515381574630737, + "num_tokens": 456324200.0, + "step": 18020 + }, + { + "epoch": 1.9790248188007906, + "grad_norm": 
2.024869441986084, + "learning_rate": 1e-06, + "loss": 0.803, + "mean_token_accuracy": 0.7400695085525513, + "num_tokens": 456352091.0, + "step": 18021 + }, + { + "epoch": 1.9791346365034044, + "grad_norm": 2.1702046394348145, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7121290564537048, + "num_tokens": 456379296.0, + "step": 18022 + }, + { + "epoch": 1.9792444542060181, + "grad_norm": 2.2095582485198975, + "learning_rate": 1e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.724131166934967, + "num_tokens": 456403769.0, + "step": 18023 + }, + { + "epoch": 1.9793542719086317, + "grad_norm": 2.657716989517212, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7191093564033508, + "num_tokens": 456423528.0, + "step": 18024 + }, + { + "epoch": 1.9794640896112452, + "grad_norm": 2.200798273086548, + "learning_rate": 1e-06, + "loss": 0.8725, + "mean_token_accuracy": 0.7213181257247925, + "num_tokens": 456449120.0, + "step": 18025 + }, + { + "epoch": 1.979573907313859, + "grad_norm": 2.288776159286499, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.6977561712265015, + "num_tokens": 456473921.0, + "step": 18026 + }, + { + "epoch": 1.9796837250164727, + "grad_norm": 2.414541244506836, + "learning_rate": 1e-06, + "loss": 0.8526, + "mean_token_accuracy": 0.730868935585022, + "num_tokens": 456496583.0, + "step": 18027 + }, + { + "epoch": 1.9797935427190865, + "grad_norm": 2.1910958290100098, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7216393947601318, + "num_tokens": 456523029.0, + "step": 18028 + }, + { + "epoch": 1.9799033604217, + "grad_norm": 2.39186954498291, + "learning_rate": 1e-06, + "loss": 0.7698, + "mean_token_accuracy": 0.7579112648963928, + "num_tokens": 456545014.0, + "step": 18029 + }, + { + "epoch": 1.9800131781243135, + "grad_norm": 2.1750917434692383, + "learning_rate": 1e-06, + "loss": 0.769, + "mean_token_accuracy": 0.7495084404945374, + "num_tokens": 
456571026.0, + "step": 18030 + }, + { + "epoch": 1.9801229958269273, + "grad_norm": 2.6228091716766357, + "learning_rate": 1e-06, + "loss": 0.8509, + "mean_token_accuracy": 0.7379946708679199, + "num_tokens": 456592397.0, + "step": 18031 + }, + { + "epoch": 1.980232813529541, + "grad_norm": 2.232752799987793, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7197977900505066, + "num_tokens": 456617699.0, + "step": 18032 + }, + { + "epoch": 1.9803426312321546, + "grad_norm": 2.3419206142425537, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7259877324104309, + "num_tokens": 456640620.0, + "step": 18033 + }, + { + "epoch": 1.9804524489347681, + "grad_norm": 2.4338934421539307, + "learning_rate": 1e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.7378523349761963, + "num_tokens": 456663720.0, + "step": 18034 + }, + { + "epoch": 1.9805622666373819, + "grad_norm": 2.456963062286377, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7060231566429138, + "num_tokens": 456685490.0, + "step": 18035 + }, + { + "epoch": 1.9806720843399956, + "grad_norm": 2.319549798965454, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7040033340454102, + "num_tokens": 456712245.0, + "step": 18036 + }, + { + "epoch": 1.9807819020426094, + "grad_norm": 2.3528645038604736, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7179173231124878, + "num_tokens": 456736597.0, + "step": 18037 + }, + { + "epoch": 1.980891719745223, + "grad_norm": 2.355807304382324, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7099459171295166, + "num_tokens": 456761612.0, + "step": 18038 + }, + { + "epoch": 1.9810015374478365, + "grad_norm": 2.288811445236206, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.7209459543228149, + "num_tokens": 456786426.0, + "step": 18039 + }, + { + "epoch": 1.9811113551504502, + "grad_norm": 2.312270164489746, + "learning_rate": 
1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7203941345214844, + "num_tokens": 456810097.0, + "step": 18040 + }, + { + "epoch": 1.981221172853064, + "grad_norm": 2.1196062564849854, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7223851680755615, + "num_tokens": 456838817.0, + "step": 18041 + }, + { + "epoch": 1.9813309905556777, + "grad_norm": 2.3812036514282227, + "learning_rate": 1e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.745924711227417, + "num_tokens": 456860995.0, + "step": 18042 + }, + { + "epoch": 1.9814408082582913, + "grad_norm": 2.3040924072265625, + "learning_rate": 1e-06, + "loss": 0.8127, + "mean_token_accuracy": 0.7392604351043701, + "num_tokens": 456885624.0, + "step": 18043 + }, + { + "epoch": 1.9815506259609048, + "grad_norm": 2.6112000942230225, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7139629125595093, + "num_tokens": 456906696.0, + "step": 18044 + }, + { + "epoch": 1.9816604436635186, + "grad_norm": 2.261575698852539, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7142236232757568, + "num_tokens": 456932084.0, + "step": 18045 + }, + { + "epoch": 1.9817702613661323, + "grad_norm": 2.1407053470611572, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7123937606811523, + "num_tokens": 456959041.0, + "step": 18046 + }, + { + "epoch": 1.9818800790687459, + "grad_norm": 2.1499998569488525, + "learning_rate": 1e-06, + "loss": 0.8563, + "mean_token_accuracy": 0.7322707772254944, + "num_tokens": 456985376.0, + "step": 18047 + }, + { + "epoch": 1.9819898967713594, + "grad_norm": 2.0785927772521973, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.729678750038147, + "num_tokens": 457012245.0, + "step": 18048 + }, + { + "epoch": 1.9820997144739732, + "grad_norm": 2.047003984451294, + "learning_rate": 1e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.6947612762451172, + "num_tokens": 457043138.0, + "step": 18049 + }, + { 
+ "epoch": 1.982209532176587, + "grad_norm": 2.3804545402526855, + "learning_rate": 1e-06, + "loss": 0.8254, + "mean_token_accuracy": 0.7373071908950806, + "num_tokens": 457066036.0, + "step": 18050 + }, + { + "epoch": 1.9823193498792007, + "grad_norm": 2.0084469318389893, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7112427949905396, + "num_tokens": 457096980.0, + "step": 18051 + }, + { + "epoch": 1.9824291675818142, + "grad_norm": 2.4672253131866455, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7154451608657837, + "num_tokens": 457119849.0, + "step": 18052 + }, + { + "epoch": 1.9825389852844277, + "grad_norm": 2.402203321456909, + "learning_rate": 1e-06, + "loss": 0.8577, + "mean_token_accuracy": 0.7299642562866211, + "num_tokens": 457142290.0, + "step": 18053 + }, + { + "epoch": 1.9826488029870415, + "grad_norm": 2.2593300342559814, + "learning_rate": 1e-06, + "loss": 0.8362, + "mean_token_accuracy": 0.7355893850326538, + "num_tokens": 457166372.0, + "step": 18054 + }, + { + "epoch": 1.9827586206896552, + "grad_norm": 2.0792646408081055, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7064290642738342, + "num_tokens": 457195468.0, + "step": 18055 + }, + { + "epoch": 1.9828684383922688, + "grad_norm": 2.1790597438812256, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7206108570098877, + "num_tokens": 457221814.0, + "step": 18056 + }, + { + "epoch": 1.9829782560948825, + "grad_norm": 2.3446285724639893, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7066999673843384, + "num_tokens": 457246613.0, + "step": 18057 + }, + { + "epoch": 1.983088073797496, + "grad_norm": 2.1869139671325684, + "learning_rate": 1e-06, + "loss": 0.8452, + "mean_token_accuracy": 0.7366967797279358, + "num_tokens": 457272307.0, + "step": 18058 + }, + { + "epoch": 1.9831978915001098, + "grad_norm": 2.2226979732513428, + "learning_rate": 1e-06, + "loss": 0.9652, + 
"mean_token_accuracy": 0.7070187926292419, + "num_tokens": 457299966.0, + "step": 18059 + }, + { + "epoch": 1.9833077092027236, + "grad_norm": 2.426170825958252, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7167163491249084, + "num_tokens": 457322026.0, + "step": 18060 + }, + { + "epoch": 1.9834175269053371, + "grad_norm": 1.9648760557174683, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7146538496017456, + "num_tokens": 457355281.0, + "step": 18061 + }, + { + "epoch": 1.9835273446079507, + "grad_norm": 2.022907257080078, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7219041585922241, + "num_tokens": 457387198.0, + "step": 18062 + }, + { + "epoch": 1.9836371623105644, + "grad_norm": 2.533094644546509, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.708210825920105, + "num_tokens": 457409784.0, + "step": 18063 + }, + { + "epoch": 1.9837469800131782, + "grad_norm": 2.185987949371338, + "learning_rate": 1e-06, + "loss": 0.8, + "mean_token_accuracy": 0.7431503534317017, + "num_tokens": 457435202.0, + "step": 18064 + }, + { + "epoch": 1.983856797715792, + "grad_norm": 2.1445133686065674, + "learning_rate": 1e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.7209274768829346, + "num_tokens": 457461807.0, + "step": 18065 + }, + { + "epoch": 1.9839666154184055, + "grad_norm": 2.1968793869018555, + "learning_rate": 1e-06, + "loss": 0.8516, + "mean_token_accuracy": 0.7281776666641235, + "num_tokens": 457486361.0, + "step": 18066 + }, + { + "epoch": 1.984076433121019, + "grad_norm": 2.2267911434173584, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7227075695991516, + "num_tokens": 457510763.0, + "step": 18067 + }, + { + "epoch": 1.9841862508236328, + "grad_norm": 2.1566126346588135, + "learning_rate": 1e-06, + "loss": 0.7607, + "mean_token_accuracy": 0.7639495730400085, + "num_tokens": 457533855.0, + "step": 18068 + }, + { + "epoch": 1.9842960685262465, 
+ "grad_norm": 2.7091970443725586, + "learning_rate": 1e-06, + "loss": 0.8655, + "mean_token_accuracy": 0.7342047691345215, + "num_tokens": 457554166.0, + "step": 18069 + }, + { + "epoch": 1.98440588622886, + "grad_norm": 2.1987454891204834, + "learning_rate": 1e-06, + "loss": 0.8307, + "mean_token_accuracy": 0.7399516105651855, + "num_tokens": 457578776.0, + "step": 18070 + }, + { + "epoch": 1.9845157039314738, + "grad_norm": 2.3322627544403076, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.7044559121131897, + "num_tokens": 457604351.0, + "step": 18071 + }, + { + "epoch": 1.9846255216340873, + "grad_norm": 1.8826895952224731, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7007033824920654, + "num_tokens": 457638976.0, + "step": 18072 + }, + { + "epoch": 1.984735339336701, + "grad_norm": 2.4857981204986572, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7232332825660706, + "num_tokens": 457662208.0, + "step": 18073 + }, + { + "epoch": 1.9848451570393149, + "grad_norm": 2.136704206466675, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7209655046463013, + "num_tokens": 457689540.0, + "step": 18074 + }, + { + "epoch": 1.9849549747419284, + "grad_norm": 2.610595226287842, + "learning_rate": 1e-06, + "loss": 0.848, + "mean_token_accuracy": 0.7307956218719482, + "num_tokens": 457708729.0, + "step": 18075 + }, + { + "epoch": 1.985064792444542, + "grad_norm": 2.8535866737365723, + "learning_rate": 1e-06, + "loss": 0.77, + "mean_token_accuracy": 0.7608343362808228, + "num_tokens": 457725034.0, + "step": 18076 + }, + { + "epoch": 1.9851746101471557, + "grad_norm": 2.243241786956787, + "learning_rate": 1e-06, + "loss": 0.8406, + "mean_token_accuracy": 0.7327423095703125, + "num_tokens": 457750099.0, + "step": 18077 + }, + { + "epoch": 1.9852844278497694, + "grad_norm": 2.344374418258667, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7204627394676208, + 
"num_tokens": 457774634.0, + "step": 18078 + }, + { + "epoch": 1.9853942455523832, + "grad_norm": 2.179969549179077, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7188825607299805, + "num_tokens": 457802152.0, + "step": 18079 + }, + { + "epoch": 1.9855040632549967, + "grad_norm": 2.301208972930908, + "learning_rate": 1e-06, + "loss": 0.8124, + "mean_token_accuracy": 0.7408660054206848, + "num_tokens": 457825829.0, + "step": 18080 + }, + { + "epoch": 1.9856138809576103, + "grad_norm": 2.3271334171295166, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7155094146728516, + "num_tokens": 457853348.0, + "step": 18081 + }, + { + "epoch": 1.985723698660224, + "grad_norm": 2.4960789680480957, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7095621824264526, + "num_tokens": 457876629.0, + "step": 18082 + }, + { + "epoch": 1.9858335163628378, + "grad_norm": 2.093662738800049, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7123426198959351, + "num_tokens": 457906890.0, + "step": 18083 + }, + { + "epoch": 1.9859433340654513, + "grad_norm": 2.2434403896331787, + "learning_rate": 1e-06, + "loss": 0.8346, + "mean_token_accuracy": 0.7383499145507812, + "num_tokens": 457930959.0, + "step": 18084 + }, + { + "epoch": 1.9860531517680649, + "grad_norm": 2.070665121078491, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7083713412284851, + "num_tokens": 457960172.0, + "step": 18085 + }, + { + "epoch": 1.9861629694706786, + "grad_norm": 2.25840163230896, + "learning_rate": 1e-06, + "loss": 0.8557, + "mean_token_accuracy": 0.7367335557937622, + "num_tokens": 457984269.0, + "step": 18086 + }, + { + "epoch": 1.9862727871732924, + "grad_norm": 2.367013692855835, + "learning_rate": 1e-06, + "loss": 0.801, + "mean_token_accuracy": 0.7473419904708862, + "num_tokens": 458007890.0, + "step": 18087 + }, + { + "epoch": 1.9863826048759061, + "grad_norm": 2.2439095973968506, + 
"learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.6918227672576904, + "num_tokens": 458033215.0, + "step": 18088 + }, + { + "epoch": 1.9864924225785197, + "grad_norm": 2.345647096633911, + "learning_rate": 1e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.7300243973731995, + "num_tokens": 458056228.0, + "step": 18089 + }, + { + "epoch": 1.9866022402811332, + "grad_norm": 2.460068941116333, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7231736183166504, + "num_tokens": 458078120.0, + "step": 18090 + }, + { + "epoch": 1.986712057983747, + "grad_norm": 1.91574227809906, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.6936014890670776, + "num_tokens": 458110295.0, + "step": 18091 + }, + { + "epoch": 1.9868218756863607, + "grad_norm": 2.2480263710021973, + "learning_rate": 1e-06, + "loss": 0.8413, + "mean_token_accuracy": 0.7402597069740295, + "num_tokens": 458134731.0, + "step": 18092 + }, + { + "epoch": 1.9869316933889745, + "grad_norm": 2.211379051208496, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7232184410095215, + "num_tokens": 458161686.0, + "step": 18093 + }, + { + "epoch": 1.987041511091588, + "grad_norm": 2.0026979446411133, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7136653661727905, + "num_tokens": 458193065.0, + "step": 18094 + }, + { + "epoch": 1.9871513287942015, + "grad_norm": 2.309851884841919, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7181307077407837, + "num_tokens": 458218069.0, + "step": 18095 + }, + { + "epoch": 1.9872611464968153, + "grad_norm": 2.424772024154663, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.712736964225769, + "num_tokens": 458240406.0, + "step": 18096 + }, + { + "epoch": 1.987370964199429, + "grad_norm": 2.448202133178711, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7258514761924744, + "num_tokens": 458263644.0, + "step": 18097 
+ }, + { + "epoch": 1.9874807819020426, + "grad_norm": 2.420548439025879, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.707284688949585, + "num_tokens": 458285754.0, + "step": 18098 + }, + { + "epoch": 1.9875905996046561, + "grad_norm": 2.2036538124084473, + "learning_rate": 1e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.7235907316207886, + "num_tokens": 458310408.0, + "step": 18099 + }, + { + "epoch": 1.9877004173072699, + "grad_norm": 2.237980365753174, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7147021293640137, + "num_tokens": 458337966.0, + "step": 18100 + }, + { + "epoch": 1.9878102350098836, + "grad_norm": 2.1226587295532227, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7180615067481995, + "num_tokens": 458366672.0, + "step": 18101 + }, + { + "epoch": 1.9879200527124974, + "grad_norm": 2.3370718955993652, + "learning_rate": 1e-06, + "loss": 0.7154, + "mean_token_accuracy": 0.7618944644927979, + "num_tokens": 458388395.0, + "step": 18102 + }, + { + "epoch": 1.988029870415111, + "grad_norm": 2.0740365982055664, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7169992327690125, + "num_tokens": 458418753.0, + "step": 18103 + }, + { + "epoch": 1.9881396881177245, + "grad_norm": 2.241976022720337, + "learning_rate": 1e-06, + "loss": 0.8301, + "mean_token_accuracy": 0.7366694211959839, + "num_tokens": 458443445.0, + "step": 18104 + }, + { + "epoch": 1.9882495058203382, + "grad_norm": 2.238663673400879, + "learning_rate": 1e-06, + "loss": 0.7786, + "mean_token_accuracy": 0.757200300693512, + "num_tokens": 458468084.0, + "step": 18105 + }, + { + "epoch": 1.988359323522952, + "grad_norm": 2.3113958835601807, + "learning_rate": 1e-06, + "loss": 0.844, + "mean_token_accuracy": 0.7352926135063171, + "num_tokens": 458492090.0, + "step": 18106 + }, + { + "epoch": 1.9884691412255655, + "grad_norm": 2.5940656661987305, + "learning_rate": 1e-06, + "loss": 0.9423, + 
"mean_token_accuracy": 0.7083725929260254, + "num_tokens": 458513468.0, + "step": 18107 + }, + { + "epoch": 1.9885789589281793, + "grad_norm": 2.194697380065918, + "learning_rate": 1e-06, + "loss": 0.852, + "mean_token_accuracy": 0.7339984178543091, + "num_tokens": 458538387.0, + "step": 18108 + }, + { + "epoch": 1.9886887766307928, + "grad_norm": 2.7118546962738037, + "learning_rate": 1e-06, + "loss": 0.8167, + "mean_token_accuracy": 0.7427871823310852, + "num_tokens": 458554599.0, + "step": 18109 + }, + { + "epoch": 1.9887985943334066, + "grad_norm": 2.2182605266571045, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7234984636306763, + "num_tokens": 458585952.0, + "step": 18110 + }, + { + "epoch": 1.9889084120360203, + "grad_norm": 2.7632064819335938, + "learning_rate": 1e-06, + "loss": 0.7522, + "mean_token_accuracy": 0.754474401473999, + "num_tokens": 458605832.0, + "step": 18111 + }, + { + "epoch": 1.9890182297386338, + "grad_norm": 2.1576900482177734, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7050902247428894, + "num_tokens": 458632054.0, + "step": 18112 + }, + { + "epoch": 1.9891280474412474, + "grad_norm": 2.0176892280578613, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7246953845024109, + "num_tokens": 458661512.0, + "step": 18113 + }, + { + "epoch": 1.9892378651438611, + "grad_norm": 2.2849209308624268, + "learning_rate": 1e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.7108404636383057, + "num_tokens": 458684489.0, + "step": 18114 + }, + { + "epoch": 1.989347682846475, + "grad_norm": 2.5543980598449707, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7218391299247742, + "num_tokens": 458704916.0, + "step": 18115 + }, + { + "epoch": 1.9894575005490887, + "grad_norm": 2.4771969318389893, + "learning_rate": 1e-06, + "loss": 0.8656, + "mean_token_accuracy": 0.7246513366699219, + "num_tokens": 458727068.0, + "step": 18116 + }, + { + "epoch": 
1.9895673182517022, + "grad_norm": 1.8965495824813843, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7025708556175232, + "num_tokens": 458761924.0, + "step": 18117 + }, + { + "epoch": 1.9896771359543157, + "grad_norm": 2.686903238296509, + "learning_rate": 1e-06, + "loss": 0.8327, + "mean_token_accuracy": 0.732516884803772, + "num_tokens": 458780282.0, + "step": 18118 + }, + { + "epoch": 1.9897869536569295, + "grad_norm": 2.2132322788238525, + "learning_rate": 1e-06, + "loss": 0.8289, + "mean_token_accuracy": 0.737859845161438, + "num_tokens": 458805548.0, + "step": 18119 + }, + { + "epoch": 1.9898967713595432, + "grad_norm": 2.016876220703125, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.70143723487854, + "num_tokens": 458835314.0, + "step": 18120 + }, + { + "epoch": 1.9900065890621568, + "grad_norm": 2.413991689682007, + "learning_rate": 1e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7306346893310547, + "num_tokens": 458855177.0, + "step": 18121 + }, + { + "epoch": 1.9901164067647705, + "grad_norm": 2.161045551300049, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7169395089149475, + "num_tokens": 458882024.0, + "step": 18122 + }, + { + "epoch": 1.990226224467384, + "grad_norm": 2.199885368347168, + "learning_rate": 1e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7344073057174683, + "num_tokens": 458908377.0, + "step": 18123 + }, + { + "epoch": 1.9903360421699978, + "grad_norm": 1.9417908191680908, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7235127687454224, + "num_tokens": 458939280.0, + "step": 18124 + }, + { + "epoch": 1.9904458598726116, + "grad_norm": 2.4367263317108154, + "learning_rate": 1e-06, + "loss": 0.8362, + "mean_token_accuracy": 0.7372879385948181, + "num_tokens": 458961282.0, + "step": 18125 + }, + { + "epoch": 1.9905556775752251, + "grad_norm": 2.2594528198242188, + "learning_rate": 1e-06, + "loss": 0.8153, + "mean_token_accuracy": 
0.7436838746070862, + "num_tokens": 458984663.0, + "step": 18126 + }, + { + "epoch": 1.9906654952778386, + "grad_norm": 2.2060506343841553, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.712729275226593, + "num_tokens": 459010778.0, + "step": 18127 + }, + { + "epoch": 1.9907753129804524, + "grad_norm": 2.312929153442383, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7155821323394775, + "num_tokens": 459034613.0, + "step": 18128 + }, + { + "epoch": 1.9908851306830662, + "grad_norm": 2.151904821395874, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.714850664138794, + "num_tokens": 459063636.0, + "step": 18129 + }, + { + "epoch": 1.99099494838568, + "grad_norm": 2.3223838806152344, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7129002809524536, + "num_tokens": 459088399.0, + "step": 18130 + }, + { + "epoch": 1.9911047660882935, + "grad_norm": 2.368633508682251, + "learning_rate": 1e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.725519061088562, + "num_tokens": 459111131.0, + "step": 18131 + }, + { + "epoch": 1.991214583790907, + "grad_norm": 2.2347755432128906, + "learning_rate": 1e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7122100591659546, + "num_tokens": 459136712.0, + "step": 18132 + }, + { + "epoch": 1.9913244014935207, + "grad_norm": 2.7721023559570312, + "learning_rate": 1e-06, + "loss": 0.7248, + "mean_token_accuracy": 0.7626711130142212, + "num_tokens": 459153129.0, + "step": 18133 + }, + { + "epoch": 1.9914342191961345, + "grad_norm": 2.164064407348633, + "learning_rate": 1e-06, + "loss": 0.8068, + "mean_token_accuracy": 0.7382477521896362, + "num_tokens": 459178161.0, + "step": 18134 + }, + { + "epoch": 1.991544036898748, + "grad_norm": 2.3306307792663574, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7071369886398315, + "num_tokens": 459201661.0, + "step": 18135 + }, + { + "epoch": 1.9916538546013618, + "grad_norm": 
2.6027514934539795, + "learning_rate": 1e-06, + "loss": 0.7766, + "mean_token_accuracy": 0.7517682313919067, + "num_tokens": 459223435.0, + "step": 18136 + }, + { + "epoch": 1.9917636723039753, + "grad_norm": 2.3399574756622314, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.6913491487503052, + "num_tokens": 459248757.0, + "step": 18137 + }, + { + "epoch": 1.991873490006589, + "grad_norm": 2.2868387699127197, + "learning_rate": 1e-06, + "loss": 0.8361, + "mean_token_accuracy": 0.739238440990448, + "num_tokens": 459271466.0, + "step": 18138 + }, + { + "epoch": 1.9919833077092028, + "grad_norm": 2.099369764328003, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.709979772567749, + "num_tokens": 459300357.0, + "step": 18139 + }, + { + "epoch": 1.9920931254118164, + "grad_norm": 2.1301119327545166, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7118051052093506, + "num_tokens": 459329394.0, + "step": 18140 + }, + { + "epoch": 1.99220294311443, + "grad_norm": 1.9462761878967285, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.7009494304656982, + "num_tokens": 459361805.0, + "step": 18141 + }, + { + "epoch": 1.9923127608170437, + "grad_norm": 2.0685677528381348, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.6918226480484009, + "num_tokens": 459390897.0, + "step": 18142 + }, + { + "epoch": 1.9924225785196574, + "grad_norm": 2.364248037338257, + "learning_rate": 1e-06, + "loss": 0.8416, + "mean_token_accuracy": 0.7383687496185303, + "num_tokens": 459413867.0, + "step": 18143 + }, + { + "epoch": 1.9925323962222712, + "grad_norm": 2.2001864910125732, + "learning_rate": 1e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.7349699139595032, + "num_tokens": 459438954.0, + "step": 18144 + }, + { + "epoch": 1.9926422139248847, + "grad_norm": 2.1625783443450928, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7297516465187073, + "num_tokens": 
459464846.0, + "step": 18145 + }, + { + "epoch": 1.9927520316274983, + "grad_norm": 2.0563838481903076, + "learning_rate": 1e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.7017501592636108, + "num_tokens": 459496889.0, + "step": 18146 + }, + { + "epoch": 1.992861849330112, + "grad_norm": 2.604106903076172, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.719004213809967, + "num_tokens": 459517307.0, + "step": 18147 + }, + { + "epoch": 1.9929716670327258, + "grad_norm": 2.304025173187256, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7062183022499084, + "num_tokens": 459541962.0, + "step": 18148 + }, + { + "epoch": 1.9930814847353393, + "grad_norm": 2.1052896976470947, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7055536508560181, + "num_tokens": 459571651.0, + "step": 18149 + }, + { + "epoch": 1.9931913024379528, + "grad_norm": 2.086435079574585, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.715603232383728, + "num_tokens": 459600611.0, + "step": 18150 + }, + { + "epoch": 1.9933011201405666, + "grad_norm": 2.1395862102508545, + "learning_rate": 1e-06, + "loss": 0.8469, + "mean_token_accuracy": 0.7381170988082886, + "num_tokens": 459625511.0, + "step": 18151 + }, + { + "epoch": 1.9934109378431804, + "grad_norm": 2.1250765323638916, + "learning_rate": 1e-06, + "loss": 0.8354, + "mean_token_accuracy": 0.7311965227127075, + "num_tokens": 459650387.0, + "step": 18152 + }, + { + "epoch": 1.993520755545794, + "grad_norm": 1.932267189025879, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.709639310836792, + "num_tokens": 459682101.0, + "step": 18153 + }, + { + "epoch": 1.9936305732484076, + "grad_norm": 2.0042614936828613, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.6994854211807251, + "num_tokens": 459712822.0, + "step": 18154 + }, + { + "epoch": 1.9937403909510212, + "grad_norm": 2.4241061210632324, + "learning_rate": 
1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7192996740341187, + "num_tokens": 459736436.0, + "step": 18155 + }, + { + "epoch": 1.993850208653635, + "grad_norm": 2.291053056716919, + "learning_rate": 1e-06, + "loss": 0.7736, + "mean_token_accuracy": 0.7507160902023315, + "num_tokens": 459759133.0, + "step": 18156 + }, + { + "epoch": 1.9939600263562487, + "grad_norm": 2.1878926753997803, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7120851278305054, + "num_tokens": 459788415.0, + "step": 18157 + }, + { + "epoch": 1.9940698440588625, + "grad_norm": 2.5123651027679443, + "learning_rate": 1e-06, + "loss": 0.8482, + "mean_token_accuracy": 0.7317977547645569, + "num_tokens": 459809222.0, + "step": 18158 + }, + { + "epoch": 1.994179661761476, + "grad_norm": 2.1823060512542725, + "learning_rate": 1e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.7393207550048828, + "num_tokens": 459834239.0, + "step": 18159 + }, + { + "epoch": 1.9942894794640895, + "grad_norm": 2.1312544345855713, + "learning_rate": 1e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7225949764251709, + "num_tokens": 459860679.0, + "step": 18160 + }, + { + "epoch": 1.9943992971667033, + "grad_norm": 2.4673798084259033, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.7111461758613586, + "num_tokens": 459883832.0, + "step": 18161 + }, + { + "epoch": 1.994509114869317, + "grad_norm": 2.517430305480957, + "learning_rate": 1e-06, + "loss": 0.7967, + "mean_token_accuracy": 0.7436040639877319, + "num_tokens": 459903616.0, + "step": 18162 + }, + { + "epoch": 1.9946189325719306, + "grad_norm": 2.2690834999084473, + "learning_rate": 1e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7219172716140747, + "num_tokens": 459928025.0, + "step": 18163 + }, + { + "epoch": 1.994728750274544, + "grad_norm": 2.446610450744629, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7316528558731079, + "num_tokens": 459950707.0, + "step": 18164 + }, + { + 
"epoch": 1.9948385679771579, + "grad_norm": 2.5315470695495605, + "learning_rate": 1e-06, + "loss": 0.8254, + "mean_token_accuracy": 0.7407166957855225, + "num_tokens": 459971726.0, + "step": 18165 + }, + { + "epoch": 1.9949483856797716, + "grad_norm": 2.439807653427124, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7172938585281372, + "num_tokens": 459993695.0, + "step": 18166 + }, + { + "epoch": 1.9950582033823854, + "grad_norm": 2.170734405517578, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7195248603820801, + "num_tokens": 460020210.0, + "step": 18167 + }, + { + "epoch": 1.995168021084999, + "grad_norm": 2.148885726928711, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7180330753326416, + "num_tokens": 460047227.0, + "step": 18168 + }, + { + "epoch": 1.9952778387876124, + "grad_norm": 2.2546091079711914, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7052345275878906, + "num_tokens": 460073990.0, + "step": 18169 + }, + { + "epoch": 1.9953876564902262, + "grad_norm": 1.9644420146942139, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7022538781166077, + "num_tokens": 460105767.0, + "step": 18170 + }, + { + "epoch": 1.99549747419284, + "grad_norm": 2.2099192142486572, + "learning_rate": 1e-06, + "loss": 0.8287, + "mean_token_accuracy": 0.7367240190505981, + "num_tokens": 460132392.0, + "step": 18171 + }, + { + "epoch": 1.9956072918954535, + "grad_norm": 2.413126230239868, + "learning_rate": 1e-06, + "loss": 0.8468, + "mean_token_accuracy": 0.7347314357757568, + "num_tokens": 460153469.0, + "step": 18172 + }, + { + "epoch": 1.9957171095980673, + "grad_norm": 2.542492389678955, + "learning_rate": 1e-06, + "loss": 0.8297, + "mean_token_accuracy": 0.7351150512695312, + "num_tokens": 460174786.0, + "step": 18173 + }, + { + "epoch": 1.9958269273006808, + "grad_norm": 2.192237377166748, + "learning_rate": 1e-06, + "loss": 0.9035, + 
"mean_token_accuracy": 0.7167971730232239, + "num_tokens": 460202263.0, + "step": 18174 + }, + { + "epoch": 1.9959367450032945, + "grad_norm": 2.399444103240967, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7163282036781311, + "num_tokens": 460228177.0, + "step": 18175 + }, + { + "epoch": 1.9960465627059083, + "grad_norm": 2.102424144744873, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7144433259963989, + "num_tokens": 460259591.0, + "step": 18176 + }, + { + "epoch": 1.9961563804085218, + "grad_norm": 2.3913567066192627, + "learning_rate": 1e-06, + "loss": 0.8099, + "mean_token_accuracy": 0.7410804033279419, + "num_tokens": 460281128.0, + "step": 18177 + }, + { + "epoch": 1.9962661981111354, + "grad_norm": 2.188694953918457, + "learning_rate": 1e-06, + "loss": 0.8717, + "mean_token_accuracy": 0.7252539396286011, + "num_tokens": 460305476.0, + "step": 18178 + }, + { + "epoch": 1.9963760158137491, + "grad_norm": 2.0711562633514404, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6920263767242432, + "num_tokens": 460334840.0, + "step": 18179 + }, + { + "epoch": 1.9964858335163629, + "grad_norm": 2.1839675903320312, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7150447368621826, + "num_tokens": 460360635.0, + "step": 18180 + }, + { + "epoch": 1.9965956512189766, + "grad_norm": 2.1135201454162598, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.6986428499221802, + "num_tokens": 460388516.0, + "step": 18181 + }, + { + "epoch": 1.9967054689215902, + "grad_norm": 2.0387771129608154, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7090016007423401, + "num_tokens": 460417022.0, + "step": 18182 + }, + { + "epoch": 1.9968152866242037, + "grad_norm": 2.016866445541382, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7308400869369507, + "num_tokens": 460446935.0, + "step": 18183 + }, + { + "epoch": 
1.9969251043268175, + "grad_norm": 2.3726353645324707, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.722493052482605, + "num_tokens": 460468968.0, + "step": 18184 + }, + { + "epoch": 1.9970349220294312, + "grad_norm": 2.4681475162506104, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7146066427230835, + "num_tokens": 460490633.0, + "step": 18185 + }, + { + "epoch": 1.9971447397320448, + "grad_norm": 2.116379499435425, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7106343507766724, + "num_tokens": 460518526.0, + "step": 18186 + }, + { + "epoch": 1.9972545574346585, + "grad_norm": 2.054753541946411, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7218565940856934, + "num_tokens": 460549780.0, + "step": 18187 + }, + { + "epoch": 1.997364375137272, + "grad_norm": 2.6369776725769043, + "learning_rate": 1e-06, + "loss": 0.8602, + "mean_token_accuracy": 0.7340186834335327, + "num_tokens": 460570444.0, + "step": 18188 + }, + { + "epoch": 1.9974741928398858, + "grad_norm": 2.4019343852996826, + "learning_rate": 1e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7332385778427124, + "num_tokens": 460594216.0, + "step": 18189 + }, + { + "epoch": 1.9975840105424996, + "grad_norm": 2.2315611839294434, + "learning_rate": 1e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7274836897850037, + "num_tokens": 460620577.0, + "step": 18190 + }, + { + "epoch": 1.997693828245113, + "grad_norm": 2.09195876121521, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.6987954378128052, + "num_tokens": 460649813.0, + "step": 18191 + }, + { + "epoch": 1.9978036459477266, + "grad_norm": 2.5585012435913086, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7185604572296143, + "num_tokens": 460669886.0, + "step": 18192 + }, + { + "epoch": 1.9979134636503404, + "grad_norm": 2.1408989429473877, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 
0.7041173577308655, + "num_tokens": 460698437.0, + "step": 18193 + }, + { + "epoch": 1.9980232813529542, + "grad_norm": 2.1103525161743164, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.691536545753479, + "num_tokens": 460728050.0, + "step": 18194 + }, + { + "epoch": 1.998133099055568, + "grad_norm": 2.2290096282958984, + "learning_rate": 1e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.7368882298469543, + "num_tokens": 460752840.0, + "step": 18195 + }, + { + "epoch": 1.9982429167581814, + "grad_norm": 2.031341314315796, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.7096077799797058, + "num_tokens": 460782902.0, + "step": 18196 + }, + { + "epoch": 1.998352734460795, + "grad_norm": 2.4383134841918945, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.716343104839325, + "num_tokens": 460806906.0, + "step": 18197 + }, + { + "epoch": 1.9984625521634087, + "grad_norm": 2.398326873779297, + "learning_rate": 1e-06, + "loss": 0.7793, + "mean_token_accuracy": 0.7444719076156616, + "num_tokens": 460829469.0, + "step": 18198 + }, + { + "epoch": 1.9985723698660225, + "grad_norm": 2.3234755992889404, + "learning_rate": 1e-06, + "loss": 0.8556, + "mean_token_accuracy": 0.7334151864051819, + "num_tokens": 460852879.0, + "step": 18199 + }, + { + "epoch": 1.998682187568636, + "grad_norm": 2.5567166805267334, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7134186029434204, + "num_tokens": 460874038.0, + "step": 18200 + }, + { + "epoch": 1.9987920052712496, + "grad_norm": 2.3083412647247314, + "learning_rate": 1e-06, + "loss": 0.8259, + "mean_token_accuracy": 0.7393175959587097, + "num_tokens": 460897440.0, + "step": 18201 + }, + { + "epoch": 1.9989018229738633, + "grad_norm": 2.1846723556518555, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.700698733329773, + "num_tokens": 460927122.0, + "step": 18202 + }, + { + "epoch": 1.999011640676477, + "grad_norm": 
2.4121310710906982, + "learning_rate": 1e-06, + "loss": 0.7557, + "mean_token_accuracy": 0.7560173273086548, + "num_tokens": 460948697.0, + "step": 18203 + }, + { + "epoch": 1.9991214583790908, + "grad_norm": 2.515474796295166, + "learning_rate": 1e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.7479671239852905, + "num_tokens": 460969416.0, + "step": 18204 + }, + { + "epoch": 1.9992312760817044, + "grad_norm": 2.2628731727600098, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7238249182701111, + "num_tokens": 460993751.0, + "step": 18205 + }, + { + "epoch": 1.999341093784318, + "grad_norm": 2.2508726119995117, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.7032521963119507, + "num_tokens": 461020676.0, + "step": 18206 + }, + { + "epoch": 1.9994509114869317, + "grad_norm": 2.414285659790039, + "learning_rate": 1e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7342203259468079, + "num_tokens": 461043917.0, + "step": 18207 + }, + { + "epoch": 1.9995607291895454, + "grad_norm": 2.2290594577789307, + "learning_rate": 1e-06, + "loss": 0.8005, + "mean_token_accuracy": 0.7497156858444214, + "num_tokens": 461068590.0, + "step": 18208 + }, + { + "epoch": 1.9996705468921592, + "grad_norm": 1.872745394706726, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7160134315490723, + "num_tokens": 461102579.0, + "step": 18209 + }, + { + "epoch": 1.9997803645947727, + "grad_norm": 2.46455454826355, + "learning_rate": 1e-06, + "loss": 0.8339, + "mean_token_accuracy": 0.7342631220817566, + "num_tokens": 461124109.0, + "step": 18210 + }, + { + "epoch": 1.9998901822973862, + "grad_norm": 2.3042285442352295, + "learning_rate": 1e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7224986553192139, + "num_tokens": 461150320.0, + "step": 18211 + }, + { + "epoch": 2.0, + "grad_norm": 2.2201008796691895, + "learning_rate": 1e-06, + "loss": 0.7904, + "mean_token_accuracy": 0.7476388216018677, + "num_tokens": 461174174.0, + 
"step": 18212 + }, + { + "epoch": 2.0, + "step": 18212, + "total_flos": 2.0766464617520038e+19, + "train_loss": 0.9501410755198944, + "train_runtime": 21390.3581, + "train_samples_per_second": 13.622, + "train_steps_per_second": 0.851 + } + ], + "logging_steps": 1, + "max_steps": 18212, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 9106, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.0766464617520038e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..8ca204d --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf00b0df531e3980d488bc5021ee1cbf5fb3df2c888a137044bc79a619d22e3a +size 13329